<!DOCTYPE html>



  


<html class="theme-next pisces use-motion" lang="zh-Hans">
<head>
  <meta charset="UTF-8"/>
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="theme-color" content="#222">


<meta name="google-site-verification" content="E9deYnivN5MuHMuIfiMZZfS0alv-d_0UjcwjBL79lGU" />



<meta name="baidu-site-verification" content="iHYWJxscwD" />










<meta http-equiv="Cache-Control" content="no-transform" />
<meta http-equiv="Cache-Control" content="no-siteapp" />



  <meta name="google-site-verification" content="true" />








  <meta name="baidu-site-verification" content="true" />







  
  
  <link href="/lib/fancybox/source/jquery.fancybox.css?v=2.1.5" rel="stylesheet" />







<link href="/lib/font-awesome/css/font-awesome.min.css?v=4.6.2" rel="stylesheet" />

<link href="/css/main.css?v=5.1.4" rel="stylesheet" />


  <link rel="apple-touch-icon" sizes="180x180" href="/images/apple-touch-icon-next.png?v=5.1.4">


  <link rel="icon" type="image/png" sizes="32x32" href="/images/favicon-32x32-next.png?v=5.1.4">


  <link rel="icon" type="image/png" sizes="16x16" href="/images/favicon-16x16-next.png?v=5.1.4">


  <link rel="mask-icon" href="/images/logo.svg?v=5.1.4" color="#222">





  <meta name="keywords" content="Python,量化投资,机器学习,kaggle,分类算法，支持向量机,实例," />










<meta name="description" content="《Python机器学习应用》的最后一部分内容强化学习是程序或智能体通过与环境不断地进行交互学习从环境到动作的映射，学习的目标就是使累计回报最大化。具体算法有马尔科夫决策，蒙特卡洛强化学习，Q-learning算法，深度强化学习，Deep Q Network。具体的例子是一个游戏，用到pygame，手机上用不了，就pass了。接下来就开始看书以及在问题中学吧。今天开始复工了，学习时间会少一些。拿个项">
<meta property="og:type" content="article">
<meta property="og:title" content="量化投资学习笔记38——机器学习实操">
<meta property="og:url" content="https://zwdnet.github.io/2020/03/22/%E9%87%8F%E5%8C%96%E6%8A%95%E8%B5%84%E5%AD%A6%E4%B9%A0%E7%AC%94%E8%AE%B038%E2%80%94%E2%80%94%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0%E5%AE%9E%E6%93%8D/index.html">
<meta property="og:site_name" content="赵瑜敏的口腔医学专业学习博客">
<meta property="og:description" content="《Python机器学习应用》的最后一部分内容强化学习是程序或智能体通过与环境不断地进行交互学习从环境到动作的映射，学习的目标就是使累计回报最大化。具体算法有马尔科夫决策，蒙特卡洛强化学习，Q-learning算法，深度强化学习，Deep Q Network。具体的例子是一个游戏，用到pygame，手机上用不了，就pass了。接下来就开始看书以及在问题中学吧。今天开始复工了，学习时间会少一些。拿个项">
<meta property="og:locale" content="zh_CN">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/01.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/02.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/03.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/04.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/05.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/06.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/07.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/08.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/09.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/10.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/11.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/12.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/13.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/14.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/15.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/16.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/17.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/18.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/19.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/20.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/21.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/22.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/23.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/24.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/25.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/26.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/27.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/28.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/29.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/30.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/31.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/32.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/33.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/34.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/35.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/36.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/37.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/38.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/39.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/40.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/41.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/42.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/43.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/44.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/45.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/46.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/47.png">
<meta property="og:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/other/wx.jpg">
<meta property="article:published_time" content="2020-03-22T05:43:59.000Z">
<meta property="article:modified_time" content="2020-08-30T05:52:04.000Z">
<meta property="article:author" content="赵瑜敏">
<meta property="article:tag" content="Python">
<meta property="article:tag" content="量化投资">
<meta property="article:tag" content="机器学习">
<meta property="article:tag" content="kaggle">
<meta property="article:tag" content="分类算法，支持向量机">
<meta property="article:tag" content="实例">
<meta name="twitter:card" content="summary">
<meta name="twitter:image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/01.png">



<script id="hexo.configurations">
  <!-- no HTML comment here: this is script content -->
  // NexT theme runtime configuration consumed by the theme's JS bundle.
  // (The `type="text/javascript"` attribute was removed: it is the HTML5
  //  default and redundant.)
  var NexT = window.NexT || {};
  var CONFIG = {
    root: '',
    scheme: 'Pisces',
    version: '5.1.4',
    // Sidebar docked left, displayed on post pages, hidden on mobile.
    sidebar: {"position":"left","display":"post","offset":12,"b2t":false,"scrollpercent":false,"onmobile":false},
    fancybox: true,
    tabs: true,
    // Entry animations for post blocks, headers/bodies and the sidebar.
    motion: {"enable":true,"async":false,"transition":{"post_block":"fadeIn","post_header":"slideDownIn","post_body":"slideDownIn","coll_header":"slideLeftIn","sidebar":"slideUpIn"}},
    // Duoshuo comment-system identity (service discontinued; kept for theme compatibility).
    duoshuo: {
      userId: '0',
      author: '博主'
    },
    // Algolia search: empty credentials mean search is effectively disabled.
    algolia: {
      applicationID: '',
      apiKey: '',
      indexName: '',
      hits: {"per_page":10},
      labels: {"input_placeholder":"Search for Posts","hits_empty":"We didn't find any results for the search: ${query}","hits_stats":"${hits} results found in ${time} ms"}
    }
  };
</script>



  <link rel="canonical" href="https://zwdnet.github.io/2020/03/22/量化投资学习笔记38——机器学习实操/"/>





  <title>量化投资学习笔记38——机器学习实操 | 赵瑜敏的口腔医学专业学习博客</title>
  








<meta name="generator" content="Hexo 5.4.0"></head>

<body itemscope itemtype="http://schema.org/WebPage" lang="zh-Hans">

  
  
    
  

  <div class="container sidebar-position-left page-post-detail">
    <div class="headband"></div>

    <!-- Site header: brand/title block, mobile nav toggle, and primary menu.
         Fix: menu hrefs previously ended in an encoded space ("/%20",
         "/tags/%20", ...), producing broken/duplicate URLs; trailing "%20"
         removed so links resolve to the real index pages. -->
    <header id="header" class="header" itemscope itemtype="http://schema.org/WPHeader">
      <div class="header-inner">
        <div class="site-brand-wrapper">
          <div class="site-meta">
            <div class="custom-logo-site-title">
              <a href="/" class="brand" rel="start">
                <span class="logo-line-before"><i></i></span>
                <span class="site-title">赵瑜敏的口腔医学专业学习博客</span>
                <span class="logo-line-after"><i></i></span>
              </a>
            </div>
            <p class="site-subtitle"></p>
          </div>

          <!-- Hamburger toggle for narrow viewports; click behavior is bound by theme JS. -->
          <div class="site-nav-toggle">
            <button type="button" aria-label="Toggle navigation">
              <span class="btn-bar"></span>
              <span class="btn-bar"></span>
              <span class="btn-bar"></span>
            </button>
          </div>
        </div>

        <nav class="site-nav" aria-label="Primary">
          <ul id="menu" class="menu">
            <li class="menu-item menu-item-home">
              <a href="/" rel="section">
                <i class="menu-item-icon fa fa-fw fa-home"></i> <br>
                首页
              </a>
            </li>
            <li class="menu-item menu-item-tags">
              <a href="/tags/" rel="section">
                <i class="menu-item-icon fa fa-fw fa-tags"></i> <br>
                标签
              </a>
            </li>
            <li class="menu-item menu-item-categories">
              <a href="/categories/" rel="section">
                <i class="menu-item-icon fa fa-fw fa-th"></i> <br>
                分类
              </a>
            </li>
            <li class="menu-item menu-item-archives">
              <a href="/archives/" rel="section">
                <i class="menu-item-icon fa fa-fw fa-archive"></i> <br>
                归档
              </a>
            </li>
          </ul>
        </nav>
      </div>
    </header>

    <main id="main" class="main">
      <div class="main-inner">
        <div class="content-wrap">
          <div id="content" class="content">
            

  <div id="posts" class="posts-expand">
    

  

  
  
  

  <article class="post post-type-normal" itemscope itemtype="http://schema.org/Article">
  
  
  
  <div class="post-block">
    <link itemprop="mainEntityOfPage" href="https://zwdnet.github.io/2020/03/22/%E9%87%8F%E5%8C%96%E6%8A%95%E8%B5%84%E5%AD%A6%E4%B9%A0%E7%AC%94%E8%AE%B038%E2%80%94%E2%80%94%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0%E5%AE%9E%E6%93%8D/">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="name" content="">
      <meta itemprop="description" content="">
      <meta itemprop="image" content="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/other/tx.jpg">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="赵瑜敏的口腔医学专业学习博客">
    </span>

    
      <header class="post-header">

        
        
          <h1 class="post-title" itemprop="name headline">量化投资学习笔记38——机器学习实操</h1>
        

        <div class="post-meta">
          <span class="post-time">
            
              <span class="post-meta-item-icon">
                <i class="fa fa-calendar-o"></i>
              </span>
              
                <span class="post-meta-item-text">发表于</span>
              
              <time title="创建于" itemprop="dateCreated datePublished" datetime="2020-03-22T05:43:59+00:00">
                2020-03-22
              </time>
            

            

            
          </span>

          
            <span class="post-category" >
            
              <span class="post-meta-divider">|</span>
            
              <span class="post-meta-item-icon">
                <i class="fa fa-folder-o"></i>
              </span>
              
                <span class="post-meta-item-text">分类于</span>
              
              
                <span itemprop="about" itemscope itemtype="http://schema.org/Thing">
                  <a href="/categories/%E9%87%8F%E5%8C%96%E6%8A%95%E8%B5%84/" itemprop="url" rel="index">
                    <span itemprop="name">量化投资</span>
                  </a>
                </span>

                
                
              
            </span>
          

          
            
          

          
          

          

          
            <div class="post-wordcount">
              
                
                  <span class="post-meta-divider">|</span>
                
                <span class="post-meta-item-icon">
                  <i class="fa fa-file-word-o"></i>
                </span>
                
                  <span class="post-meta-item-text">字数统计&#58;</span>
                
                <span title="字数统计">
                  10.5k
                </span>
              

              
                <span class="post-meta-divider">|</span>
              

              
                <span class="post-meta-item-icon">
                  <i class="fa fa-clock-o"></i>
                </span>
                
                  <span class="post-meta-item-text">阅读时长 &asymp;</span>
                
                <span title="阅读时长">
                  52
                </span>
              
            </div>
          

          

        </div>
      </header>
    

    
    
    
    <div class="post-body" itemprop="articleBody">

      
      

      
        <p>《Python机器学习应用》的最后一部分内容<br>强化学习<br>是程序或智能体通过与环境不断地进行交互学习从环境到动作的映射，学习的目标就是使累计回报最大化。<br>具体算法有马尔科夫决策，蒙特卡洛强化学习，Q-learning算法，深度强化学习，Deep Q Network。<br>具体的例子是一个游戏，用到pygame，手机上用不了，就pass了。<br>接下来就开始看书以及在问题中学吧。<br>今天开始复工了，学习时间会少一些。<br>拿个项目来实操一下机器学习的整个过程，就拿kaggle上面的泰坦尼克号来练手吧。<br>本文代码: <a target="_blank" rel="noopener" href="https://github.com/zwdnet/MyQuant/tree/master/38">https://github.com/zwdnet/MyQuant/tree/master/38</a><br>先照国外大神的文章来吧。[1]<br>这个kernel主要关注于特征工程。<br>背景就不介绍了<br>1.先加载数据。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># 将训练数据和训练数据合并到一起</span></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">concat_df</span>(<span class="params">train_data, test_data</span>):</span></span><br><span class="line">    <span class="keyword">return</span> pd.concat([train_data, test_data], sort = <span class="literal">True</span>).reset_index(drop = <span class="literal">True</span>)</span><br><span class="line">    </span><br><span class="line">    </span><br><span class="line"><span class="comment"># 将数据集重新分割为训练集和测试集</span></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">divide_df</span>(<span class="params">all_data</span>):</span></span><br><span class="line">    <span class="keyword">return</span> all_data.loc[:<span class="number">890</span>], all_data.loc[<span class="number">891</span>:].drop([<span 
class="string">&quot;Survived&quot;</span>], axis = <span class="number">1</span>)</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="keyword">if</span> __name__ == <span class="string">&quot;__main__&quot;</span>:</span><br><span class="line">    <span class="comment"># 载入数据</span></span><br><span class="line">    df_train = pd.read_csv(<span class="string">&quot;./data/train.csv&quot;</span>)</span><br><span class="line">    df_test = pd.read_csv(<span class="string">&quot;./data/test.csv&quot;</span>)</span><br><span class="line">    df_all = concat_df(df_train, df_test)</span><br><span class="line">    </span><br><span class="line">    df_train.name = <span class="string">&quot;Training Set&quot;</span></span><br><span class="line">    df_test.name = <span class="string">&quot;Test Set&quot;</span></span><br><span class="line">    df_all.name = <span class="string">&quot;All Set&quot;</span></span><br><span class="line">    </span><br><span class="line">    dfs = [df_train, df_test]</span><br><span class="line">    </span><br><span class="line">    print(<span class="string">&quot;训练样本量 = &#123;&#125;&quot;</span>.<span class="built_in">format</span>(df_train.shape[<span class="number">0</span>]))</span><br><span class="line">    print(<span class="string">&quot;测试样本量 = &#123;&#125;&quot;</span>.<span class="built_in">format</span>(df_test.shape[<span class="number">0</span>]))</span><br><span class="line">    print(<span class="string">&quot;训练中的X的形状 = &#123;&#125;&quot;</span>.<span class="built_in">format</span>(df_train.shape))</span><br><span class="line">    print(<span class="string">&quot;训练中的y的形状 = &#123;&#125;&quot;</span>.<span class="built_in">format</span>(df_train[<span class="string">&quot;Survived&quot;</span>].shape[<span class="number">0</span>]))</span><br><span class="line">    print(<span class="string">&quot;测试中的X的形状 = &#123;&#125;&quot;</span>.<span 
class="built_in">format</span>(df_test.shape))</span><br><span class="line">    print(<span class="string">&quot;测试中的y的形状 = &#123;&#125;&quot;</span>.<span class="built_in">format</span>(df_test.shape[<span class="number">0</span>]))</span><br><span class="line">    print(df_train.columns)</span><br><span class="line">    print(df_test.columns)</span><br></pre></td></tr></table></figure>
<p><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/01.png"><br>将训练集和测试集合并到一个DataFrame里。<br>2.进行探索性数据分析<br>概览<br>具体数据列为:<br>PassengerId:乘客编号，每行唯一，对结果无影响。<br>Survived:乘客生存情况，我们要预测的值，有1或0两个情况，1是活，0是死。<br>Pclass:乘客等级，分三级，1是上层等级，2是中等阶级，3是下层阶级。<br>Name,Sex,Age不言自明。<br>SibSp:是乘客的同胞兄弟姐妹和配偶的总数。<br>Parch:乘客的父母和子女总数。<br>Ticket:乘客的票号。<br>Fare:乘客的票价。<br>Cabin:乘客的舱号。<br>Embarked:乘客登船港口，有C,Q,S三个。<br>查看数据情况。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># 查看数据情况</span></span><br><span class="line">print(df_train.info())</span><br><span class="line">print(df_train.sample(<span class="number">3</span>))</span><br></pre></td></tr></table></figure>
<p><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/02.png"></p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br></pre></td><td class="code"><pre><span class="line">print(df_test.info())</span><br><span class="line">print(df_test.sample(<span class="number">3</span>))</span><br></pre></td></tr></table></figure>
<p><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/03.png"><br>Age,Fare,Embark,Cabin有缺失值。<br>处理缺失值<br>处理缺失值的时候将训练集和测试集的数据合并到一起很方便，不然填充的数据可能对训练集或测试集形成过拟合。Age,Embark和Fare缺失值的比例不高，而Cabin有80%的缺失值。前三者可以使用描述统计学方法填充缺失值，而Cabin不行。<br>处理年龄缺失值。<br>用年龄中位数。不是全部乘客的年龄中位数，而是根据乘客等级分类的年龄中位数。因为其与年龄和生存情况有很高的相关系数。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># 计算年龄与其它特征的相关性</span></span><br><span class="line">df_all_corr = df_all.corr().<span class="built_in">abs</span>().unstack().sort_values(kind=<span class="string">&quot;quicksort&quot;</span>, ascending=<span class="literal">False</span>).reset_index()</span><br><span class="line">df_all_corr.rename(columns=&#123;<span class="string">&quot;level_0&quot;</span>: <span class="string">&quot;Feature 1&quot;</span>, <span class="string">&quot;level_1&quot;</span>: <span class="string">&quot;Feature 2&quot;</span>, <span class="number">0</span>: <span class="string">&#x27;Correlation Coefficient&#x27;</span>&#125;, inplace=<span class="literal">True</span>)</span><br><span class="line">print(df_all_corr[df_all_corr[<span class="string">&quot;Feature 1&quot;</span>] == <span class="string">&quot;Age&quot;</span>])</span><br></pre></td></tr></table></figure>
<p><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/04.png"><br>为了更加精确，用性别作为填充年龄缺失值时的第二级分类。当乘客等级上升时，无论男女其年龄中位数都上升。而女性的年龄中位数稍微低于男性。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># 按等级和性别分组计算年龄中位数</span></span><br><span class="line">    age_by_pclass_sex = df_all.groupby([<span class="string">&quot;Sex&quot;</span>, <span class="string">&quot;Pclass&quot;</span>]).median()[<span class="string">&quot;Age&quot;</span>]</span><br><span class="line">   </span><br><span class="line">    <span class="keyword">for</span> pclass <span class="keyword">in</span> <span class="built_in">range</span>(<span class="number">1</span>, <span class="number">4</span>):</span><br><span class="line">        <span class="keyword">for</span> sex <span class="keyword">in</span> [<span class="string">&quot;female&quot;</span>, <span class="string">&quot;male&quot;</span>]:</span><br><span class="line">            print(<span class="string">&quot;分等级乘客年龄中位数为&#123;&#125; &#123;&#125;s: &#123;&#125;&quot;</span>.<span class="built_in">format</span>(pclass, sex, age_by_pclass_sex[sex][pclass]))</span><br><span class="line">    print(<span class="string">&quot;所有乘客的年龄中位数为: &#123;&#125;&quot;</span>.<span class="built_in">format</span>(df_all[<span class="string">&quot;Age&quot;</span>].median()))</span><br></pre></td></tr></table></figure>
<p><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/05.png"></p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># 用各组的年龄中位数填充缺失值</span></span><br><span class="line">df_all[<span class="string">&quot;Age&quot;</span>] = df_all.groupby([<span class="string">&quot;Sex&quot;</span>, <span class="string">&quot;Pclass&quot;</span>])[<span class="string">&quot;Age&quot;</span>].apply(<span class="keyword">lambda</span> x : x.fillna(x.median()))</span><br><span class="line">print(df_all[<span class="string">&quot;Age&quot;</span>].isnull().<span class="built_in">sum</span>())</span><br></pre></td></tr></table></figure>
<p>处理登船地点缺失值<br>仅有两个缺失值</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># 查看Embarked缺失值信息</span></span><br><span class="line">print(df_all[df_all[<span class="string">&quot;Embarked&quot;</span>].isnull()])</span><br></pre></td></tr></table></figure>
<p><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/06.png"><br>都是女性，等级票号都一样，说明她们彼此认识。跟她们一样等级的女性的典型登船地点是C，但不意味着她们也如此。谷歌二位的名字，发现二者是主仆关系，从S登船。于是用S填充。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br></pre></td><td class="code"><pre><span class="line">    <span class="comment"># 根据搜索的真实值用S填充Embarked</span></span><br><span class="line">    df_all[<span class="string">&quot;Embarked&quot;</span>] = df_all[<span class="string">&quot;Embarked&quot;</span>].fillna(<span class="string">&#x27;S&#x27;</span>)</span><br><span class="line">处理票价缺失值</span><br><span class="line">    <span class="comment"># 处理Fare缺失值</span></span><br><span class="line">    <span class="comment"># 输出缺失值情况</span></span><br><span class="line">    print(df_all[df_all[<span class="string">&quot;Fare&quot;</span>].isnull()])</span><br></pre></td></tr></table></figure>
<p><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/07.png"><br>只有一个缺失值，可以用与其相同的等级和家庭人数的男性乘客的票价的中位数来填充。<br>最后处理舱位数据的缺失值<br>舱位的数据处理比较棘手，因为缺失数据较多，而且其与生存率有关系，不能完全抛弃。<br>Cabin数据的第一个字母代表了舱位的位置，舱位大部分只供某一等级的乘客使用，但也有部分舱位是多个等级混合使用的。<br>如图<br><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/08.png"><br>在船的甲板上有6个房间，被标为T、U、W、X、Y、Z。但只有T舱在数据集中出现。<br>A、B、C舱只有第一等级的乘客。<br>D、E舱各种等级的乘客都有。<br>F、G舱有二三等级的乘客。<br>从A舱到G舱，与楼梯的距离增加，这可能是影响生存率的一个因素。<br>下面画一下乘客的舱位分布图，M代表缺失值。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br><span class="line">48</span><br><span class="line">49</span><br><span class="line">50</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># 计算每个等级的乘客在每个舱位的数量</span></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">get_pclass_dist</span>(<span class="params">df</span>):</span></span><br><span class="line">    deck_counts = &#123;<span 
class="string">&#x27;A&#x27;</span>: &#123;&#125;, <span class="string">&#x27;B&#x27;</span>: &#123;&#125;, <span class="string">&#x27;C&#x27;</span>: &#123;&#125;, <span class="string">&#x27;D&#x27;</span>: &#123;&#125;, <span class="string">&#x27;E&#x27;</span>: &#123;&#125;, <span class="string">&#x27;F&#x27;</span>: &#123;&#125;, <span class="string">&#x27;G&#x27;</span>: &#123;&#125;, <span class="string">&#x27;M&#x27;</span>: &#123;&#125;, <span class="string">&#x27;T&#x27;</span>: &#123;&#125;&#125;</span><br><span class="line">    decks = df.columns.levels[<span class="number">0</span>]</span><br><span class="line">   </span><br><span class="line">    <span class="keyword">for</span> deck <span class="keyword">in</span> decks:</span><br><span class="line">        <span class="keyword">for</span> pclass <span class="keyword">in</span> <span class="built_in">range</span>(<span class="number">1</span>, <span class="number">4</span>):</span><br><span class="line">            <span class="keyword">try</span>:</span><br><span class="line">                count = df[deck][pclass][<span class="number">0</span>]</span><br><span class="line">                deck_counts[deck][pclass] = count</span><br><span class="line">            <span class="keyword">except</span> KeyError:</span><br><span class="line">                deck_counts[deck][pclass] = <span class="number">0</span></span><br><span class="line">    df_decks = pd.DataFrame(deck_counts)</span><br><span class="line">    deck_percentages = &#123;&#125;</span><br><span class="line">    </span><br><span class="line">    <span class="comment"># 计算每个乘客等级在每个客舱的比例</span></span><br><span class="line">    <span class="keyword">for</span> col <span class="keyword">in</span> df_decks.columns:</span><br><span class="line">        deck_percentages[col] = [(count/df_decks[col].<span class="built_in">sum</span>()) * <span class="number">100</span> <span class="keyword">for</span> count <span class="keyword">in</span> 
df_decks[col]]</span><br><span class="line">        </span><br><span class="line">    <span class="keyword">return</span> deck_counts, deck_percentages</span><br><span class="line">    </span><br><span class="line"></span><br><span class="line"><span class="comment"># 绘图显示等级舱位距离</span></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">display_pclass_dist</span>(<span class="params">percentages</span>):</span></span><br><span class="line">    df_percentages = pd.DataFrame(percentages).transpose()</span><br><span class="line">    deck_names = (<span class="string">&#x27;A&#x27;</span>, <span class="string">&#x27;B&#x27;</span>, <span class="string">&#x27;C&#x27;</span>, <span class="string">&#x27;D&#x27;</span>, <span class="string">&#x27;E&#x27;</span>, <span class="string">&#x27;F&#x27;</span>, <span class="string">&#x27;G&#x27;</span>, <span class="string">&#x27;M&#x27;</span>, <span class="string">&#x27;T&#x27;</span>)</span><br><span class="line">    bar_count = np.arange(<span class="built_in">len</span>(deck_names))</span><br><span class="line">    bar_width = <span class="number">0.85</span></span><br><span class="line">   </span><br><span class="line">    pclass1 = df_percentages[<span class="number">0</span>]</span><br><span class="line">    pclass2 = df_percentages[<span class="number">1</span>]</span><br><span class="line">    pclass3 = df_percentages[<span class="number">2</span>]</span><br><span class="line">   </span><br><span class="line">    plt.figure(figsize = (<span class="number">20</span>, <span class="number">10</span>))</span><br><span class="line">    plt.bar(bar_count, pclass1, color = <span class="string">&#x27;#b5ffb9&#x27;</span>, edgecolor = <span class="string">&quot;white&quot;</span>,width = bar_width, label = <span class="string">&quot;Passenger Class 1&quot;</span>)</span><br><span class="line">    plt.bar(bar_count, pclass2, bottom = pclass1, color = <span 
class="string">&quot;#f9bc86&quot;</span>, edgecolor = <span class="string">&quot;white&quot;</span>,width = bar_width, label = <span class="string">&quot;Passenger Class 2&quot;</span>)</span><br><span class="line">    plt.bar(bar_count, pclass3, bottom = pclass1 + pclass2, color = <span class="string">&quot;#a3acff&quot;</span>, edgecolor = <span class="string">&quot;white&quot;</span>,width = bar_width, label = <span class="string">&quot;Passenger Class 3&quot;</span>)</span><br><span class="line">   </span><br><span class="line">    plt.xlabel(<span class="string">&quot;Deck&quot;</span>, size = <span class="number">15</span>, labelpad = <span class="number">20</span>)</span><br><span class="line">    plt.ylabel(<span class="string">&quot;Passenger Class Percentage&quot;</span>, size = <span class="number">15</span>, labelpad = <span class="number">20</span>)</span><br><span class="line">    plt.xticks(bar_count, deck_names)</span><br><span class="line">    plt.tick_params(axis = <span class="string">&quot;x&quot;</span>, labelsize = <span class="number">15</span>)</span><br><span class="line">    plt.tick_params(axis = <span class="string">&quot;y&quot;</span>, labelsize = <span class="number">15</span>)</span><br><span class="line">    plt.legend(loc=<span class="string">&quot;upper left&quot;</span>,bbox_to_anchor=(<span class="number">1</span>, <span class="number">1</span>), prop=&#123;<span class="string">&#x27;size&#x27;</span>: <span class="number">15</span>&#125;)</span><br><span class="line">    plt.title(<span class="string">&quot;Passenger Class Distribution in Decks&quot;</span>, size=<span class="number">18</span>, y=<span class="number">1.05</span>)</span><br><span class="line">    plt.savefig(<span class="string">&quot;pclassdeck.png&quot;</span>)</span><br><span class="line"></span><br><span class="line"><span class="comment"># 画图看看每个客舱的乘客等级比例</span></span><br><span class="line">    all_deck_count, all_deck_per = 
get_pclass_dist(df_all_decks)</span><br><span class="line">    display_pclass_dist(all_deck_per)</span><br></pre></td></tr></table></figure>
<p><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/09.png"><br>越往后，低等级乘客的比例越大。<br>其中，A、B、C三个船舱全部乘客为等级1的乘客。船舱D的乘客87%为一等级，13%为二等级。船舱E的乘客83%为一等级，10%为二等级，7%为三等级。船舱F为62%二等级和38%三等级。G舱的全部是三等级的乘客。有一位乘客在T舱，T舱接近A舱，他是一等级的乘客，因此分组时被分到A舱。舱位数据缺失的标记为“M”，由于不太可能得知这些乘客的真实舱位，因此把M也作为一个舱位。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># 计算每个船舱的生存比例</span></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">get_survived_dist</span>(<span class="params">df</span>):</span></span><br><span class="line">    surv_counts = &#123;<span class="string">&#x27;A&#x27;</span>:&#123;&#125;, <span class="string">&#x27;B&#x27;</span>:&#123;&#125;, <span 
class="string">&#x27;C&#x27;</span>:&#123;&#125;, <span class="string">&#x27;D&#x27;</span>:&#123;&#125;, <span class="string">&#x27;E&#x27;</span>:&#123;&#125;, <span class="string">&#x27;F&#x27;</span>:&#123;&#125;, <span class="string">&#x27;G&#x27;</span>:&#123;&#125;, <span class="string">&#x27;M&#x27;</span>:&#123;&#125;&#125;</span><br><span class="line">    decks = df.columns.levels[<span class="number">0</span>]</span><br><span class="line">   </span><br><span class="line">    <span class="keyword">for</span> deck <span class="keyword">in</span> decks:</span><br><span class="line">        <span class="keyword">for</span> survive <span class="keyword">in</span> <span class="built_in">range</span>(<span class="number">0</span>, <span class="number">2</span>):</span><br><span class="line">            surv_counts[deck][survive] = df[deck][survive][<span class="number">0</span>]</span><br><span class="line">   </span><br><span class="line">    df_surv = pd.DataFrame(surv_counts)</span><br><span class="line">    surv_percentages = &#123;&#125;</span><br><span class="line">   </span><br><span class="line">    <span class="keyword">for</span> col <span class="keyword">in</span> df_surv.columns:</span><br><span class="line">        surv_percentages[col] = [(count / df_surv[col].<span class="built_in">sum</span>()) * <span class="number">100</span> <span class="keyword">for</span> count <span class="keyword">in</span> df_surv[col]]</span><br><span class="line">       </span><br><span class="line">    <span class="keyword">return</span> surv_counts, surv_percentages</span><br><span class="line">   </span><br><span class="line">   </span><br><span class="line"><span class="comment"># 绘制每个船舱乘客生存率图</span></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">display_surv_dist</span>(<span class="params">percentages</span>):</span></span><br><span class="line">    df_survived_percentages = 
pd.DataFrame(percentages).transpose()</span><br><span class="line">    deck_names = (<span class="string">&#x27;A&#x27;</span>, <span class="string">&#x27;B&#x27;</span>, <span class="string">&#x27;C&#x27;</span>, <span class="string">&#x27;D&#x27;</span>, <span class="string">&#x27;E&#x27;</span>, <span class="string">&#x27;F&#x27;</span>, <span class="string">&#x27;G&#x27;</span>, <span class="string">&#x27;M&#x27;</span>)</span><br><span class="line">    bar_count = np.arange(<span class="built_in">len</span>(deck_names))</span><br><span class="line">    bar_width = <span class="number">0.85</span></span><br><span class="line">   </span><br><span class="line">    not_survived = df_survived_percentages[<span class="number">0</span>]</span><br><span class="line">    survived = df_survived_percentages[<span class="number">1</span>]</span><br><span class="line">   </span><br><span class="line">    plt.figure(figsize=(<span class="number">20</span>, <span class="number">10</span>))</span><br><span class="line">    plt.bar(bar_count, not_survived, color=<span class="string">&#x27;#b5ffb9&#x27;</span>, edgecolor=<span class="string">&#x27;white&#x27;</span>, width=bar_width, label=<span class="string">&quot;Not Survived&quot;</span>)</span><br><span class="line">    plt.bar(bar_count, survived, bottom=not_survived, color=<span class="string">&#x27;#f9bc86&#x27;</span>, edgecolor=<span class="string">&#x27;white&#x27;</span>, width=bar_width, label=<span class="string">&quot;Survived&quot;</span>)</span><br><span class="line">    plt.xlabel(<span class="string">&#x27;Deck&#x27;</span>, size=<span class="number">15</span>, labelpad=<span class="number">20</span>)</span><br><span class="line">    plt.ylabel(<span class="string">&#x27;Survival Percentage&#x27;</span>, size=<span class="number">15</span>, labelpad=<span class="number">20</span>)</span><br><span class="line">    plt.xticks(bar_count, deck_names)   </span><br><span class="line">    plt.tick_params(axis=<span 
class="string">&#x27;x&#x27;</span>, labelsize=<span class="number">15</span>)</span><br><span class="line">    plt.tick_params(axis=<span class="string">&#x27;y&#x27;</span>, labelsize=<span class="number">15</span>)</span><br><span class="line">   </span><br><span class="line">    plt.legend(loc=<span class="string">&#x27;upper left&#x27;</span>, bbox_to_anchor=(<span class="number">1</span>, <span class="number">1</span>), prop=&#123;<span class="string">&#x27;size&#x27;</span>: <span class="number">15</span>&#125;)</span><br><span class="line">    plt.title(<span class="string">&#x27;Survival Percentage in Decks&#x27;</span>, size=<span class="number">18</span>, y=<span class="number">1.05</span>)</span><br><span class="line">   </span><br><span class="line">    plt.savefig(<span class="string">&quot;CabinSurvived.png&quot;</span>)</span><br><span class="line"></span><br><span class="line">    <span class="comment"># 计算每个客舱的乘客生存率，绘图</span></span><br><span class="line">    df_all_decks_survived = df_all.groupby([<span class="string">&#x27;Deck&#x27;</span>, <span class="string">&#x27;Survived&#x27;</span>]).count().drop(columns=[<span class="string">&#x27;Sex&#x27;</span>, <span class="string">&#x27;Age&#x27;</span>, <span class="string">&#x27;SibSp&#x27;</span>, <span class="string">&#x27;Parch&#x27;</span>, <span class="string">&#x27;Fare&#x27;</span>, <span class="string">&#x27;Embarked&#x27;</span>, <span class="string">&#x27;Pclass&#x27;</span>, <span class="string">&#x27;Cabin&#x27;</span>, <span class="string">&#x27;PassengerId&#x27;</span>, <span class="string">&#x27;Ticket&#x27;</span>]).rename(columns=&#123;<span class="string">&#x27;Name&#x27;</span>:<span class="string">&#x27;Count&#x27;</span>&#125;).transpose()</span><br><span class="line">    all_surv_count, all_surv_per = get_survived_dist(df_all_decks_survived)</span><br><span class="line">    display_surv_dist(all_surv_per)</span><br></pre></td></tr></table></figure>
<p><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/10.png"><br>每个船舱的乘客生存率不一样，B、C、D、E舱的生存率最高。乘客多为一等级的。M舱的生存率最低，其乘客多为二三等级的。<br>因此可将数据分为ABC(只有一等乘客)，DE，FG(乘客组成类似)，M组。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># 将客舱数据按组成比例分组</span></span><br><span class="line">df_all[<span class="string">&quot;Deck&quot;</span>] = df_all[<span class="string">&quot;Deck&quot;</span>].replace([<span class="string">&#x27;A&#x27;</span>, <span class="string">&#x27;B&#x27;</span>, <span class="string">&#x27;C&#x27;</span>], <span class="string">&#x27;ABC&#x27;</span>)</span><br><span class="line">df_all[<span class="string">&quot;Deck&quot;</span>] = df_all[<span class="string">&quot;Deck&quot;</span>].replace([<span class="string">&#x27;D&#x27;</span>, <span class="string">&#x27;E&#x27;</span>], <span class="string">&#x27;DE&#x27;</span>)</span><br><span class="line">df_all[<span class="string">&quot;Deck&quot;</span>] = df_all[<span class="string">&quot;Deck&quot;</span>].replace([<span class="string">&#x27;F&#x27;</span>, <span class="string">&#x27;G&#x27;</span>], <span class="string">&#x27;FG&#x27;</span>)</span><br><span class="line">print(df_all[<span class="string">&quot;Deck&quot;</span>].value_counts())</span><br></pre></td></tr></table></figure>
<p><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/11.png"><br>现在，我们已经用Deck特征代替了Cabin，可以将Cabin丢弃了。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># 划分训练集和测试集</span></span><br><span class="line">df_train, df_test = divide_df(df_all)</span><br><span class="line">dfs = [df_train, df_test]</span><br><span class="line"></span><br><span class="line"><span class="keyword">for</span> df <span class="keyword">in</span> dfs:</span><br><span class="line">    print(df.info())</span><br></pre></td></tr></table></figure>
<p><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/12.png"><br>搞定，没有缺失值了。<br>再来看看目标值的分布，在训练集中获救率38.38%(342/891)，死亡率61.62%(549/891)。<br>画图看看。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># 目标值的分布</span></span><br><span class="line">    survived = df_train[<span class="string">&quot;Survived&quot;</span>].value_counts()[<span class="number">1</span>]</span><br><span class="line">    not_survived = df_train[<span class="string">&quot;Survived&quot;</span>].value_counts()[<span class="number">0</span>]</span><br><span class="line">    survived_per = survived / df_train.shape[<span class="number">0</span>] * <span class="number">100</span></span><br><span class="line">    not_survived_per = not_survived / df_train.shape[<span class="number">0</span>] * <span class="number">100</span></span><br><span class="line">    print(<span class="string">&#x27;&#123;&#125;名乘客中的&#123;&#125;名获救，占训练集的&#123;:.2f&#125;%。&#x27;</span>.<span class="built_in">format</span>(df_train.shape[<span class="number">0</span>], survived, survived_per))</span><br><span class="line">    print(<span class="string">&#x27;&#123;&#125;名乘客中的&#123;&#125;名遇难，占训练集的&#123;:.2f&#125;%。&#x27;</span>.<span class="built_in">format</span>(df_train.shape[<span class="number">0</span>], not_survived, not_survived_per))</span><br><span class="line">    </span><br><span class="line">    plt.figure(figsize=(<span class="number">10</span>, <span class="number">8</span>))</span><br><span class="line">    
sns.countplot(df_train[<span class="string">&quot;Survived&quot;</span>])</span><br><span class="line">    plt.xlabel(<span class="string">&quot;Survival&quot;</span>, size = <span class="number">15</span>, labelpad = <span class="number">15</span>)</span><br><span class="line">    plt.ylabel(<span class="string">&#x27;Passenger Count&#x27;</span>, size=<span class="number">15</span>, labelpad=<span class="number">15</span>)</span><br><span class="line">    plt.xticks((<span class="number">0</span>, <span class="number">1</span>), [<span class="string">&#x27;Not Survived (&#123;0:.2f&#125;%)&#x27;</span>.<span class="built_in">format</span>(not_survived_per), <span class="string">&#x27;Survived (&#123;0:.2f&#125;%)&#x27;</span>.<span class="built_in">format</span>(survived_per)])</span><br><span class="line">    plt.tick_params(axis=<span class="string">&#x27;x&#x27;</span>, labelsize=<span class="number">13</span>)</span><br><span class="line">    plt.tick_params(axis=<span class="string">&#x27;y&#x27;</span>, labelsize=<span class="number">13</span>)</span><br><span class="line">    </span><br><span class="line">    plt.title(<span class="string">&quot;Training Set Survival Distribution&quot;</span>)</span><br><span class="line">    plt.savefig(<span class="string">&quot;surviveddist.png&quot;</span>)</span><br></pre></td></tr></table></figure>
<p><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/13.png"><br>分析特征之间的相关性。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># 分析特征间的相关性</span></span><br><span class="line">df_train_corr = df_train.drop([<span class="string">&quot;PassengerId&quot;</span>], axis = <span class="number">1</span>).corr().<span class="built_in">abs</span>().unstack().sort_values(kind = <span class="string">&quot;quicksort&quot;</span>, ascending = <span class="literal">False</span>).reset_index()</span><br><span class="line">df_train_corr.rename(columns = &#123;<span class="string">&quot;level_0&quot;</span>: <span class="string">&quot;Feature 1&quot;</span>,  <span class="string">&quot;level_1&quot;</span>: <span class="string">&quot;Feature 2&quot;</span>, <span class="number">0</span>: <span class="string">&quot;Correlation Coefficient&quot;</span>&#125;, inplace = <span class="literal">True</span>)</span><br><span class="line">df_train_corr.drop(df_train_corr.iloc[<span class="number">1</span>::<span class="number">2</span>].index, inplace = <span class="literal">True</span>)</span><br><span class="line">df_train_corr_nd = df_train_corr.drop(df_train_corr[df_train_corr[<span class="string">&quot;Correlation Coefficient&quot;</span>] == <span class="number">1.0</span>].index)</span><br><span class="line">   </span><br><span class="line">df_test_corr = df_test.drop([<span class="string">&quot;PassengerId&quot;</span>], axis = <span 
class="number">1</span>).corr().<span class="built_in">abs</span>().unstack().sort_values(kind = <span class="string">&quot;quicksort&quot;</span>, ascending = <span class="literal">False</span>).reset_index()</span><br><span class="line">df_test_corr.rename(columns = &#123;<span class="string">&quot;level_0&quot;</span>: <span class="string">&quot;Feature 1&quot;</span>,  <span class="string">&quot;level_1&quot;</span>: <span class="string">&quot;Feature 2&quot;</span>, <span class="number">0</span>: <span class="string">&quot;Correlation Coefficient&quot;</span>&#125;, inplace = <span class="literal">True</span>)</span><br><span class="line">df_test_corr.drop(df_test_corr.iloc[<span class="number">1</span>::<span class="number">2</span>].index, inplace = <span class="literal">True</span>)</span><br><span class="line">df_test_corr_nd = df_test_corr.drop(df_test_corr[df_test_corr[<span class="string">&quot;Correlation Coefficient&quot;</span>] == <span class="number">1.0</span>].index)</span><br><span class="line">   </span><br><span class="line"><span class="comment"># 训练集的高相关性</span></span><br><span class="line">corr = df_train_corr_nd[<span class="string">&quot;Correlation Coefficient&quot;</span>] &gt; <span class="number">0.1</span></span><br><span class="line">print(df_train_corr_nd[corr])</span><br><span class="line"><span class="comment"># 测试集的高相关性</span></span><br><span class="line">corr = df_test_corr_nd[<span class="string">&quot;Correlation Coefficient&quot;</span>] &gt; <span class="number">0.1</span></span><br><span class="line">print(df_test_corr_nd[corr])</span><br></pre></td></tr></table></figure>
<p><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/14.png"><br>训练集和测试集都是船票价格和等级相关性最大。<br>画图看看。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># 绘相关性图</span></span><br><span class="line">fig = plt.figure(figsize = (<span class="number">20</span>, <span class="number">20</span>))</span><br><span class="line">sns.heatmap(df_train.drop([<span class="string">&quot;PassengerId&quot;</span>], axis = <span class="number">1</span>).corr(), annot = <span class="literal">True</span>, square = <span class="literal">True</span>, cmap = <span class="string">&quot;coolwarm&quot;</span>, annot_kws = &#123;<span class="string">&quot;size&quot;</span> : <span class="number">14</span>&#125;)</span><br><span class="line">plt.tick_params(axis = <span class="string">&quot;x&quot;</span>, labelsize = <span class="number">14</span>)</span><br><span class="line">plt.title(<span class="string">&quot;Training Set Correlations&quot;</span>, size = <span class="number">15</span>)</span><br><span class="line">plt.savefig(<span class="string">&quot;TrainFeatureCorr.png&quot;</span>)</span><br><span class="line"></span><br><span class="line">fig = plt.figure(figsize = (<span class="number">20</span>, <span class="number">20</span>))</span><br><span class="line">sns.heatmap(df_test.drop([<span class="string">&quot;PassengerId&quot;</span>], axis = <span class="number">1</span>).corr(), annot = <span class="literal">True</span>, square = <span class="literal">True</span>, cmap = <span class="string">&quot;coolwarm&quot;</span>, annot_kws = &#123;<span class="string">&quot;size&quot;</span> : <span 
class="number">14</span>&#125;)</span><br><span class="line">plt.tick_params(axis = <span class="string">&quot;y&quot;</span>, labelsize = <span class="number">14</span>)</span><br><span class="line">plt.title(<span class="string">&quot;Testing Set Correlations&quot;</span>, size = <span class="number">15</span>)</span><br><span class="line">plt.savefig(<span class="string">&quot;TestFeatureCorr.png&quot;</span>)</span><br></pre></td></tr></table></figure>
<p><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/15.png"><br><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/16.png"><br>目标值在特征中的分布<br>先看连续型特征，有两个，年龄和票价。它们有良好的分割点用于构建决策树。一个潜在的问题是它们在训练集中的分布与在测试集中不同，后者更平滑。<br>画图看看。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># 研究连续型特征的分布</span></span><br><span class="line">cont_features = [<span class="string">&quot;Age&quot;</span>, <span class="string">&quot;Fare&quot;</span>]</span><br><span class="line">surv = df_train[<span class="string">&quot;Survived&quot;</span>] == <span class="number">1</span></span><br><span class="line">fig, axs = plt.subplots(ncols = <span class="number">1</span>, nrows = <span class="number">6</span>, figsize = (<span class="number">15</span>, <span class="number">15</span>))</span><br><span class="line">plt.subplots_adjust(right = <span class="number">1.5</span>)</span><br><span class="line">   </span><br><span class="line"><span class="comment"># 特征中的获救人数分布</span></span><br><span class="line">sns.distplot(df_train[~surv][<span class="string">&quot;Age&quot;</span>], label = <span class="string">&quot;Not Survived&quot;</span>, hist = <span class="literal">True</span>, color=<span class="string">&#x27;#e74c3c&#x27;</span>, ax=axs[<span class="number">0</span>])</span><br><span class="line">axs[<span class="number">0</span>].set_title(<span class="string">&quot;Age_Survived 
dist&quot;</span>)</span><br><span class="line">sns.distplot(df_train[surv][<span class="string">&quot;Fare&quot;</span>], label=<span class="string">&#x27;Survived&#x27;</span>, hist=<span class="literal">True</span>, color=<span class="string">&#x27;#2ecc71&#x27;</span>, ax=axs[<span class="number">1</span>])</span><br><span class="line">axs[<span class="number">1</span>].set_title(<span class="string">&quot;Fare_Survived dist&quot;</span>)</span><br><span class="line"><span class="comment"># 数据集中的获救人数分布</span></span><br><span class="line">sns.distplot(df_train[<span class="string">&quot;Age&quot;</span>], label=<span class="string">&#x27;Training Set&#x27;</span>, hist=<span class="literal">False</span>, color=<span class="string">&#x27;#e74c3c&#x27;</span>, ax=axs[<span class="number">2</span>])</span><br><span class="line">axs[<span class="number">2</span>].set_title(<span class="string">&quot;TrainSetAge_Survived dist&quot;</span>)</span><br><span class="line">sns.distplot(df_test[<span class="string">&quot;Age&quot;</span>], label=<span class="string">&#x27;Test Set&#x27;</span>, hist=<span class="literal">False</span>, color=<span class="string">&#x27;#2ecc71&#x27;</span>, ax=axs[<span class="number">3</span>])</span><br><span class="line">axs[<span class="number">3</span>].set_title(<span class="string">&quot;TestSetAge_Survived dist&quot;</span>)</span><br><span class="line">   </span><br><span class="line">sns.distplot(df_train[<span class="string">&quot;Fare&quot;</span>], label=<span class="string">&#x27;Training Set&#x27;</span>, hist=<span class="literal">False</span>, color=<span class="string">&#x27;#e74c3c&#x27;</span>, ax=axs[<span class="number">4</span>])</span><br><span class="line">axs[<span class="number">4</span>].set_title(<span class="string">&quot;TrainSetFare_Survived dist&quot;</span>)</span><br><span class="line">sns.distplot(df_test[<span class="string">&quot;Fare&quot;</span>], label=<span class="string">&#x27;Test Set&#x27;</span>, 
hist=<span class="literal">False</span>, color=<span class="string">&#x27;#2ecc71&#x27;</span>, ax=axs[<span class="number">5</span>])</span><br><span class="line">axs[<span class="number">5</span>].set_title(<span class="string">&quot;TestSetFare_Survived dist&quot;</span>)</span><br><span class="line"></span><br><span class="line">plt.savefig(<span class="string">&quot;feature_dist.png&quot;</span>)</span><br></pre></td></tr></table></figure>
<p><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/17.png"><br>年龄特征的获救率分布显示低于15岁组的获救率高于其它组，船票特征的获救率分布显示在分布的末端有较高的获救率。<br>再来研究分类变量。<br>每个分类变量至少有一个类别有较高的生存率，这对预测很有帮助。其中Pclass和Sex是最好的分类特征，因为其分布的匀质性。<br>还是画图看看</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># 研究分类特征</span></span><br><span class="line">cat_features = [<span class="string">&#x27;Embarked&#x27;</span>, <span class="string">&#x27;Parch&#x27;</span>, <span class="string">&#x27;Pclass&#x27;</span>, <span class="string">&#x27;Sex&#x27;</span>, <span class="string">&#x27;SibSp&#x27;</span>, <span class="string">&#x27;Deck&#x27;</span>]</span><br><span class="line"></span><br><span class="line">fig, axs = plt.subplots(ncols=<span class="number">2</span>, nrows=<span class="number">3</span>, figsize=(<span class="number">20</span>, <span class="number">20</span>))</span><br><span class="line">plt.subplots_adjust(right=<span class="number">1.5</span>, top=<span class="number">1.25</span>)</span><br><span class="line"></span><br><span class="line"><span class="keyword">for</span> i, feature <span class="keyword">in</span> <span class="built_in">enumerate</span>(cat_features, <span class="number">1</span>):</span><br><span class="line">    plt.subplot(<span class="number">2</span>, <span class="number">3</span>, i)</span><br><span class="line">    sns.countplot(x=feature, hue=<span class="string">&#x27;Survived&#x27;</span>, data=df_train)</span><br><span class="line">    </span><br><span class="line">    plt.xlabel(<span 
class="string">&#x27;&#123;&#125;&#x27;</span>.<span class="built_in">format</span>(feature), size=<span class="number">20</span>, labelpad=<span class="number">15</span>)</span><br><span class="line">    plt.ylabel(<span class="string">&#x27;Passenger Count&#x27;</span>, size=<span class="number">20</span>, labelpad=<span class="number">15</span>)    </span><br><span class="line">    plt.tick_params(axis=<span class="string">&#x27;x&#x27;</span>, labelsize=<span class="number">20</span>)</span><br><span class="line">    plt.tick_params(axis=<span class="string">&#x27;y&#x27;</span>, labelsize=<span class="number">20</span>)</span><br><span class="line">    </span><br><span class="line">    plt.legend([<span class="string">&#x27;Not Survived&#x27;</span>, <span class="string">&#x27;Survived&#x27;</span>], loc=<span class="string">&#x27;upper center&#x27;</span>, prop=&#123;<span class="string">&#x27;size&#x27;</span>: <span class="number">18</span>&#125;)</span><br><span class="line">    plt.title(<span class="string">&#x27;Count of Survival in &#123;&#125; Feature&#x27;</span>.<span class="built_in">format</span>(feature), size=<span class="number">20</span>, y=<span class="number">1.05</span>)</span><br><span class="line">    </span><br><span class="line">plt.savefig(<span class="string">&quot;cat_feature_dist.png&quot;</span>)</span><br></pre></td></tr></table></figure>
<p><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/18.png"><br>从S港口登船的乘客获救率最低，从C港登船的乘客有超过半数获救。有一个家庭成员的乘客获救率最高。<br>分析数据的结论:特征之间相关性很高，因此可以通过特征转换产生新的特征。连续型特征可以通过决策树模型进行划分。分类变量的不同分类之间的生存率有很大差异。这些特征可以进行one-hot编码。一些特征可以彼此联合形成新的特征。在数据探索阶段生成了一个新的特征”Deck”并放弃了”Cabin”特征。<br>重新将数据合并为一个。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># 重新将数据合并</span></span><br><span class="line">df_all = concat_df(df_train, df_test)</span><br><span class="line">print(df_all.head())</span><br></pre></td></tr></table></figure>
<p><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/19.png"><br>数据清洗探索阶段就完成了，接下来进行特征工程。<br>3.特征工程<br>连续特征<br>票价<br>将票价分为13个组，画图看看。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># 将Fare划分为13组，画图</span></span><br><span class="line">df_all[<span class="string">&quot;Fare&quot;</span>] = pd.qcut(df_all[<span class="string">&quot;Fare&quot;</span>], <span class="number">13</span>)</span><br><span class="line"><span class="comment"># 绘图</span></span><br><span class="line">fig, axs = plt.subplots(figsize=(<span class="number">22</span>, <span class="number">9</span>))</span><br><span class="line">sns.countplot(x = <span class="string">&quot;Fare&quot;</span>, hue = <span class="string">&quot;Survived&quot;</span>, data = df_all)</span><br><span class="line">plt.xlabel(<span class="string">&#x27;Fare&#x27;</span>, size=<span class="number">15</span>, labelpad=<span class="number">20</span>)</span><br><span class="line">plt.ylabel(<span class="string">&#x27;Passenger Count&#x27;</span>, size=<span class="number">15</span>, labelpad=<span class="number">20</span>)</span><br><span class="line">plt.tick_params(axis=<span class="string">&#x27;x&#x27;</span>, labelsize=<span class="number">10</span>)</span><br><span class="line">plt.tick_params(axis=<span class="string">&#x27;y&#x27;</span>, labelsize=<span class="number">15</span>)</span><br><span class="line"></span><br><span class="line">plt.legend([<span class="string">&#x27;Not Survived&#x27;</span>, <span class="string">&#x27;Survived&#x27;</span>], loc=<span class="string">&#x27;upper right&#x27;</span>, prop=&#123;<span 
class="string">&#x27;size&#x27;</span>: <span class="number">15</span>&#125;)</span><br><span class="line">plt.title(<span class="string">&#x27;Count of Survival in &#123;&#125; Feature&#x27;</span>.<span class="built_in">format</span>(<span class="string">&#x27;Fare&#x27;</span>), size=<span class="number">15</span>, y=<span class="number">1.05</span>)</span><br><span class="line"></span><br><span class="line">plt.savefig(<span class="string">&quot;FE_fare.png&quot;</span>)</span><br></pre></td></tr></table></figure>
<p><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/20.png"><br>左侧的生还率最低，右侧的生还率最高。<br>中间有一组(15.742, 23.25]有异常。<br>年龄，年龄符合正态分布，将年龄分为十组，第一组有最高的获救率，第四组的获救率最低。有一个异常组(34.0, 40.0]的获救率偏高。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br></pre></td><td class="code"><pre><span class="line"> <span class="comment"># 将Age划分为10组，画图</span></span><br><span class="line">df_all[<span class="string">&quot;Age&quot;</span>] = pd.qcut(df_all[<span class="string">&quot;Age&quot;</span>], <span class="number">10</span>)</span><br><span class="line"><span class="comment"># 绘图</span></span><br><span class="line">fig, axs = plt.subplots(figsize=(<span class="number">22</span>, <span class="number">9</span>))</span><br><span class="line">sns.countplot(x = <span class="string">&quot;Age&quot;</span>, hue = <span class="string">&quot;Survived&quot;</span>, data = df_all)</span><br><span class="line">plt.xlabel(<span class="string">&#x27;Age&#x27;</span>, size=<span class="number">15</span>, labelpad=<span class="number">20</span>)</span><br><span class="line">plt.ylabel(<span class="string">&#x27;Passenger Count&#x27;</span>, size=<span class="number">15</span>, labelpad=<span class="number">20</span>)</span><br><span class="line">plt.tick_params(axis=<span class="string">&#x27;x&#x27;</span>, labelsize=<span class="number">15</span>)</span><br><span class="line">plt.tick_params(axis=<span class="string">&#x27;y&#x27;</span>, labelsize=<span class="number">15</span>)</span><br><span class="line"></span><br><span class="line">plt.legend([<span class="string">&#x27;Not Survived&#x27;</span>, <span class="string">&#x27;Survived&#x27;</span>], loc=<span class="string">&#x27;upper right&#x27;</span>, prop=&#123;<span 
class="string">&#x27;size&#x27;</span>: <span class="number">15</span>&#125;)</span><br><span class="line">plt.title(<span class="string">&#x27;Count of Survival in &#123;&#125; Feature&#x27;</span>.<span class="built_in">format</span>(<span class="string">&#x27;Age&#x27;</span>), size=<span class="number">15</span>, y=<span class="number">1.05</span>)</span><br><span class="line"></span><br><span class="line">plt.savefig(<span class="string">&quot;FE_age.png&quot;</span>)</span><br></pre></td></tr></table></figure>
<p><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/21.png"><br>频率编码<br>建立Family_Size特征，通过SibSp+Parch+1计算得到。SibSp是兄弟姐妹数量，Parch是父母子女的数量，相加即为乘客在船上的亲人的总数。最后加1，是乘客本人。值为1标为"Alone"，值为2，3，4标为"Small"，值为5，6标为"Medium"，值为7，8，11标为"Large"。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># 创建Family_Size特征 画图分析</span></span><br><span class="line">df_all[<span class="string">&quot;Family_Size&quot;</span>] = df_all[<span class="string">&quot;SibSp&quot;</span>] + df_all[<span class="string">&quot;Parch&quot;</span>] + <span class="number">1</span></span><br><span class="line">fig, axs = plt.subplots(figsize=(<span class="number">10</span>, <span class="number">10</span>), ncols=<span class="number">2</span>, nrows=<span class="number">2</span>)</span><br><span class="line"><span class="comment">#plt.subplots_adjust(right = 1.5)</span></span><br><span class="line">   </span><br><span class="line">sns.barplot(x=df_all[<span class="string">&#x27;Family_Size&#x27;</span>].value_counts().index, y=df_all[<span class="string">&#x27;Family_Size&#x27;</span>].value_counts().values, ax=axs[<span class="number">0</span>][<span class="number">0</span>])</span><br><span class="line">sns.countplot(x=<span class="string">&#x27;Family_Size&#x27;</span>, 
hue=<span class="string">&#x27;Survived&#x27;</span>, data=df_all, ax=axs[<span class="number">0</span>][<span class="number">1</span>])</span><br><span class="line">   </span><br><span class="line">axs[<span class="number">0</span>][<span class="number">0</span>].set_title(<span class="string">&#x27;Family Size Feature Value Counts&#x27;</span>, size=<span class="number">10</span>, y=<span class="number">1.05</span>)</span><br><span class="line">axs[<span class="number">0</span>][<span class="number">1</span>].set_title(<span class="string">&#x27;Survival Counts in Family Size &#x27;</span>, size=<span class="number">10</span>, y=<span class="number">1.05</span>)</span><br><span class="line">   </span><br><span class="line">family_map = &#123;<span class="number">1</span>: <span class="string">&#x27;Alone&#x27;</span>, <span class="number">2</span>: <span class="string">&#x27;Small&#x27;</span>, <span class="number">3</span>: <span class="string">&#x27;Small&#x27;</span>, <span class="number">4</span>: <span class="string">&#x27;Small&#x27;</span>, <span class="number">5</span>: <span class="string">&#x27;Medium&#x27;</span>, <span class="number">6</span>: <span class="string">&#x27;Medium&#x27;</span>, <span class="number">7</span>: <span class="string">&#x27;Large&#x27;</span>, <span class="number">8</span>: <span class="string">&#x27;Large&#x27;</span>, <span class="number">11</span>: <span class="string">&#x27;Large&#x27;</span>&#125;</span><br><span class="line">df_all[<span class="string">&#x27;Family_Size_Grouped&#x27;</span>] = df_all[<span class="string">&#x27;Family_Size&#x27;</span>].<span class="built_in">map</span>(family_map)</span><br><span class="line">   </span><br><span class="line">sns.barplot(x=df_all[<span class="string">&#x27;Family_Size_Grouped&#x27;</span>].value_counts().index,y=df_all[<span class="string">&#x27;Family_Size_Grouped&#x27;</span>].value_counts().values, ax=axs[<span class="number">1</span>][<span 
class="number">0</span>])</span><br><span class="line">sns.countplot(x=<span class="string">&#x27;Family_Size_Grouped&#x27;</span>, hue=<span class="string">&#x27;Survived&#x27;</span>, data=df_all, ax=axs[<span class="number">1</span>][<span class="number">1</span>])</span><br><span class="line">   </span><br><span class="line">axs[<span class="number">1</span>][<span class="number">0</span>].set_title(<span class="string">&#x27;Family Size Feature Value Counts After Grouping&#x27;</span>, size=<span class="number">10</span>, y=<span class="number">1.05</span>)</span><br><span class="line">axs[<span class="number">1</span>][<span class="number">1</span>].set_title(<span class="string">&#x27;Survival Counts in Family Size After Grouping&#x27;</span>, size=<span class="number">10</span>, y=<span class="number">1.05</span>)</span><br><span class="line">   </span><br><span class="line"><span class="keyword">for</span> i <span class="keyword">in</span> <span class="built_in">range</span>(<span class="number">2</span>):</span><br><span class="line">    axs[i][<span class="number">1</span>].legend([<span class="string">&#x27;Not Survived&#x27;</span>, <span class="string">&#x27;Survived&#x27;</span>], loc=<span class="string">&#x27;upper right&#x27;</span>, prop=&#123;<span class="string">&#x27;size&#x27;</span>: <span class="number">10</span>&#125;)</span><br><span class="line">    <span class="keyword">for</span> j <span class="keyword">in</span> <span class="built_in">range</span>(<span class="number">2</span>):</span><br><span class="line">        axs[i][j].tick_params(axis=<span class="string">&#x27;x&#x27;</span>, labelsize=<span class="number">10</span>)</span><br><span class="line">        axs[i][j].tick_params(axis=<span class="string">&#x27;y&#x27;</span>, labelsize=<span class="number">10</span>)</span><br><span class="line">        axs[i][j].set_xlabel(<span class="string">&#x27;&#x27;</span>)</span><br><span class="line">        axs[i][j].set_ylabel(<span 
class="string">&#x27;&#x27;</span>)</span><br><span class="line">plt.savefig(<span class="string">&quot;FE_family.png&quot;</span>)</span><br></pre></td></tr></table></figure>
<p><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/22.png"><br>有很多Ticket的异常值需要分析，因此将它们按照出现频率分组可以简化分析。<br>很多人成群出行，比如朋友，主仆等，他们并不被划分为一家人，但是使用同样的Ticket。<br>按照有相同Ticket的人数分组，画图看看。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># Ticket_Frequency特征，绘图</span></span><br><span class="line">df_all[<span class="string">&quot;Ticket_Frequency&quot;</span>] = df_all.groupby(<span class="string">&quot;Ticket&quot;</span>)[<span class="string">&quot;Ticket&quot;</span>].transform(<span class="string">&quot;count&quot;</span>)</span><br><span class="line">fig, axs = plt.subplots(figsize = (<span class="number">12</span>, <span class="number">9</span>))</span><br><span class="line">sns.countplot(x = <span class="string">&quot;Ticket_Frequency&quot;</span>, hue = <span class="string">&quot;Survived&quot;</span>, data = df_all)</span><br><span class="line">plt.xlabel(<span class="string">&#x27;Ticket Frequency&#x27;</span>, size=<span class="number">15</span>, labelpad=<span class="number">20</span>)</span><br><span class="line">plt.ylabel(<span class="string">&#x27;Passenger Count&#x27;</span>, size=<span class="number">15</span>, labelpad=<span class="number">20</span>)</span><br><span class="line">plt.tick_params(axis=<span class="string">&#x27;x&#x27;</span>, labelsize=<span class="number">15</span>)</span><br><span class="line">plt.tick_params(axis=<span class="string">&#x27;y&#x27;</span>, labelsize=<span class="number">15</span>)</span><br><span class="line">   </span><br><span class="line">plt.legend([<span class="string">&#x27;Not Survived&#x27;</span>, <span class="string">&#x27;Survived&#x27;</span>], loc=<span class="string">&#x27;upper 
right&#x27;</span>, prop=&#123;<span class="string">&#x27;size&#x27;</span>: <span class="number">15</span>&#125;)</span><br><span class="line">plt.title(<span class="string">&#x27;Count of Survival in &#123;&#125; Feature&#x27;</span>.<span class="built_in">format</span>(<span class="string">&#x27;Ticket Frequency&#x27;</span>), size=<span class="number">15</span>, y=<span class="number">1.05</span>)</span><br><span class="line">   </span><br><span class="line">plt.savefig(<span class="string">&quot;FE_ticket.png&quot;</span>)</span><br></pre></td></tr></table></figure>
<p><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/23.png"><br>由图可知，有相同Ticket的人数为2，3，4时获救率最高，超过4时，获救率急剧下降。<br>称谓及婚否<br>称谓(Title)是根据姓名前缀新建的特征。姓名前有很多前缀，将Miss, Mrs, Ms, Mlle, Lady, Mme, the Countess, Dona替换为Miss/Mrs/Ms，Dr, Col, Major, Jonkheer, Capt, Sir, Don 和 Rev替换为Dr/Military/Noble/Clergy。<br>Is_Married是基于姓名Mrs前缀的二元特征。这是所有女性中获救率最高的特征。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># 根据姓名前缀生成Title和Is_Married特征并分析。</span></span><br><span class="line">df_all[<span class="string">&quot;Title&quot;</span>] = df_all[<span class="string">&quot;Name&quot;</span>].<span class="built_in">str</span>.split(<span class="string">&#x27;, &#x27;</span>, expand = <span class="literal">True</span>)[<span class="number">1</span>].<span class="built_in">str</span>.split(<span class="string">&#x27;.&#x27;</span>, expand = <span class="literal">True</span>)[<span class="number">0</span>]</span><br><span class="line">df_all[<span class="string">&quot;Is_Married&quot;</span>] = <span class="number">0</span></span><br><span class="line">df_all[<span class="string">&quot;Is_Married&quot;</span>].loc[df_all[<span class="string">&quot;Title&quot;</span>] == <span class="string">&quot;Mrs&quot;</span>] = <span class="number">1</span></span><br><span class="line">   </span><br><span class="line">fig, axs = plt.subplots(nrows=<span class="number">2</span>, figsize=(<span class="number">20</span>, <span class="number">20</span>))</span><br><span class="line">sns.barplot(x=df_all[<span 
class="string">&#x27;Title&#x27;</span>].value_counts().index, y=df_all[<span class="string">&#x27;Title&#x27;</span>].value_counts().values, ax=axs[<span class="number">0</span>])</span><br><span class="line">   </span><br><span class="line">axs[<span class="number">0</span>].tick_params(axis=<span class="string">&#x27;x&#x27;</span>, labelsize=<span class="number">10</span>)</span><br><span class="line">axs[<span class="number">1</span>].tick_params(axis=<span class="string">&#x27;x&#x27;</span>, labelsize=<span class="number">15</span>)</span><br><span class="line">   </span><br><span class="line"><span class="keyword">for</span> i <span class="keyword">in</span> <span class="built_in">range</span>(<span class="number">2</span>):</span><br><span class="line">    axs[i].tick_params(axis=<span class="string">&#x27;y&#x27;</span>, labelsize=<span class="number">15</span>)</span><br><span class="line">   </span><br><span class="line">axs[<span class="number">0</span>].set_title(<span class="string">&#x27;Title Feature Value Counts&#x27;</span>, size=<span class="number">20</span>, y=<span class="number">1.05</span>)</span><br><span class="line">   </span><br><span class="line">df_all[<span class="string">&#x27;Title&#x27;</span>] = df_all[<span class="string">&#x27;Title&#x27;</span>].replace([<span class="string">&#x27;Miss&#x27;</span>, <span class="string">&#x27;Mrs&#x27;</span>,<span class="string">&#x27;Ms&#x27;</span>, <span class="string">&#x27;Mlle&#x27;</span>, <span class="string">&#x27;Lady&#x27;</span>, <span class="string">&#x27;Mme&#x27;</span>, <span class="string">&#x27;the Countess&#x27;</span>, <span class="string">&#x27;Dona&#x27;</span>], <span class="string">&#x27;Miss/Mrs/Ms&#x27;</span>)</span><br><span class="line">df_all[<span class="string">&#x27;Title&#x27;</span>] = df_all[<span class="string">&#x27;Title&#x27;</span>].replace([<span class="string">&#x27;Dr&#x27;</span>, <span class="string">&#x27;Col&#x27;</span>, <span 
class="string">&#x27;Major&#x27;</span>, <span class="string">&#x27;Jonkheer&#x27;</span>, <span class="string">&#x27;Capt&#x27;</span>, <span class="string">&#x27;Sir&#x27;</span>, <span class="string">&#x27;Don&#x27;</span>, <span class="string">&#x27;Rev&#x27;</span>], <span class="string">&#x27;Dr/Military/Noble/Clergy&#x27;</span>)</span><br><span class="line">   </span><br><span class="line">sns.barplot(x=df_all[<span class="string">&#x27;Title&#x27;</span>].value_counts().index, y=df_all[<span class="string">&#x27;Title&#x27;</span>].value_counts().values, ax=axs[<span class="number">1</span>])</span><br><span class="line">axs[<span class="number">1</span>].set_title(<span class="string">&#x27;Title Feature Value Counts After Grouping&#x27;</span>, size=<span class="number">20</span>, y=<span class="number">1.05</span>)</span><br><span class="line">   </span><br><span class="line">plt.savefig(<span class="string">&quot;FE_title.png&quot;</span>)</span><br><span class="line">plt.close()</span><br></pre></td></tr></table></figure>
<p><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/24.png"><br>用extract_surname来提取姓名中的姓。根据姓来创建Family特征，然后根据相同家庭来对乘客分组。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># 根据姓名提取姓</span></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">extract_surname</span>(<span class="params">data</span>):</span></span><br><span class="line">    families = []</span><br><span class="line">    </span><br><span class="line">    <span class="keyword">for</span> i <span class="keyword">in</span> <span class="built_in">range</span>(<span class="built_in">len</span>(data)):</span><br><span class="line">        name = data.iloc[i]</span><br><span class="line">        </span><br><span class="line">        <span class="keyword">if</span> <span class="string">&#x27;(&#x27;</span> <span class="keyword">in</span> name:</span><br><span class="line">            name_no_bracket = name.split(<span class="string">&#x27;(&#x27;</span>)[<span class="number">0</span>]</span><br><span class="line">        <span class="keyword">else</span>:</span><br><span class="line">            name_no_bracket = name</span><br><span class="line">    
    </span><br><span class="line">        family = name_no_bracket.split(<span class="string">&#x27;(&#x27;</span>)[<span class="number">0</span>]</span><br><span class="line">        title = name_no_bracket.split(<span class="string">&#x27;,&#x27;</span>)[<span class="number">1</span>].strip().split(<span class="string">&#x27; &#x27;</span>)[<span class="number">0</span>]</span><br><span class="line">        </span><br><span class="line">        <span class="comment"># 将符号用空格代替</span></span><br><span class="line">        <span class="keyword">for</span> c <span class="keyword">in</span> string.punctuation:</span><br><span class="line">            family = family.replace(c, <span class="string">&#x27;&#x27;</span>).strip()</span><br><span class="line">            </span><br><span class="line">        families.append(family)</span><br><span class="line">        </span><br><span class="line">    <span class="keyword">return</span> families</span><br><span class="line"></span><br><span class="line"><span class="comment"># 根据Name特征建立Family特征</span></span><br><span class="line">df_all[<span class="string">&quot;Family&quot;</span>] = extract_surname(df_all[<span class="string">&quot;Name&quot;</span>])</span><br><span class="line">df_train, df_test = tools.divide_df(df_all)</span><br><span class="line">dfs = [df_train, df_test]</span><br></pre></td></tr></table></figure>
<p>Family_Survival_Rate是根据训练集的Family特征计算得到的，因为测试集没有获救信息。建立一个在训练集和测试集中都存在的家庭名称的列表，计算这些家庭的获救率，保存于Family_Survival_Rate特征中。<br>再建一个Family_Survival_Rate_NA特征，用于只在测试集中存在的家庭，他们的家庭生存率无法计算。<br>Ticket_Survival_Rate和Ticket_Survival_Rate_NA特征用同样的方法计算。Ticket_Survival_Rate和Family_Survival_Rate计算平均值成为Survival_Rate，Ticket_Survival_Rate_NA和Family_Survival_Rate_NA也计算平均值保存于Survival_Rate_NA中。<br>具体见代码</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br><span class="line">48</span><br><span class="line">49</span><br><span class="line">50</span><br><span class="line">51</span><br><span class="line">52</span><br><span class="line">53</span><br><span class="line">54</span><br><span class="line">55</span><br><span class="line">56</span><br><span class="line">57</span><br><span class="line">58</span><br><span class="line">59</span><br><span class="line">60</span><br><span 
class="line">61</span><br><span class="line">62</span><br><span class="line">63</span><br><span class="line">64</span><br><span class="line">65</span><br><span class="line">66</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># 检查同时在训练集和测试集中且成员数大于1的家庭</span></span><br><span class="line"><span class="keyword">for</span> i <span class="keyword">in</span> <span class="built_in">range</span>(<span class="built_in">len</span>(df_family_survival_rate)):</span><br><span class="line">    <span class="keyword">if</span> df_family_survival_rate.index[i] <span class="keyword">in</span> non_unique_families <span class="keyword">and</span> df_family_survival_rate.iloc[i, <span class="number">1</span>] &gt; <span class="number">1</span>:</span><br><span class="line">        family_rates[df_family_survival_rate.index[i]] = df_family_survival_rate.iloc[i, <span class="number">0</span>]</span><br><span class="line">       </span><br><span class="line"><span class="keyword">for</span> i <span class="keyword">in</span> <span class="built_in">range</span>(<span class="built_in">len</span>(df_ticket_survival_rate)):</span><br><span class="line">    <span class="keyword">if</span> df_ticket_survival_rate.index[i] <span class="keyword">in</span> non_unique_tickets <span class="keyword">and</span> df_ticket_survival_rate.iloc[i, <span class="number">1</span>] &gt; <span class="number">1</span>:</span><br><span class="line">        ticket_rates[df_ticket_survival_rate.index[i]] = df_ticket_survival_rate.iloc[i, <span class="number">0</span>]</span><br><span class="line">   </span><br><span class="line">mean_survival_rate = np.mean(df_train[<span class="string">&quot;Survived&quot;</span>])</span><br><span class="line">   </span><br><span class="line">train_family_survival_rate = []</span><br><span class="line">train_family_survival_rate_NA = []</span><br><span class="line">test_family_survival_rate = []</span><br><span 
class="line">test_family_survival_rate_NA = []</span><br><span class="line">   </span><br><span class="line"><span class="keyword">for</span> i <span class="keyword">in</span> <span class="built_in">range</span>(<span class="built_in">len</span>(df_train)):</span><br><span class="line">    <span class="keyword">if</span> df_train[<span class="string">&quot;Family&quot;</span>][i] <span class="keyword">in</span> family_rates:</span><br><span class="line">        train_family_survival_rate.append(family_rates[df_train[<span class="string">&quot;Family&quot;</span>][i]])</span><br><span class="line">        train_family_survival_rate_NA.append(<span class="number">1</span>)</span><br><span class="line">    <span class="keyword">else</span>:</span><br><span class="line">        train_family_survival_rate.append(mean_survival_rate)</span><br><span class="line">        train_family_survival_rate_NA.append(<span class="number">0</span>)</span><br><span class="line">       </span><br><span class="line"><span class="keyword">for</span> i <span class="keyword">in</span> <span class="built_in">range</span>(<span class="built_in">len</span>(df_test)):</span><br><span class="line">    <span class="keyword">if</span> df_test[<span class="string">&quot;Family&quot;</span>].iloc[i] <span class="keyword">in</span> family_rates:</span><br><span class="line">        test_family_survival_rate.append(family_rates[df_test[<span class="string">&quot;Family&quot;</span>].iloc[i]])</span><br><span class="line">        test_family_survival_rate_NA.append(<span class="number">1</span>)</span><br><span class="line">    <span class="keyword">else</span>:</span><br><span class="line">        test_family_survival_rate.append(mean_survival_rate)</span><br><span class="line">        test_family_survival_rate_NA.append(<span class="number">0</span>)</span><br><span class="line">       </span><br><span class="line">df_train[<span class="string">&quot;Family_Survival_Rate&quot;</span>] = 
train_family_survival_rate</span><br><span class="line">df_train[<span class="string">&quot;Family_Survival_Rate_NA&quot;</span>] = train_family_survival_rate_NA</span><br><span class="line">df_test[<span class="string">&quot;Family_Survival_Rate&quot;</span>] = test_family_survival_rate</span><br><span class="line">df_test[<span class="string">&quot;Family_Survival_Rate_NA&quot;</span>] = test_family_survival_rate_NA</span><br><span class="line">   </span><br><span class="line">train_ticket_survival_rate = []</span><br><span class="line">train_ticket_survival_rate_NA = []</span><br><span class="line">test_ticket_survival_rate = []</span><br><span class="line">test_ticket_survival_rate_NA = []</span><br><span class="line">   </span><br><span class="line"><span class="keyword">for</span> i <span class="keyword">in</span> <span class="built_in">range</span>(<span class="built_in">len</span>(df_train)):</span><br><span class="line">    <span class="keyword">if</span> df_train[<span class="string">&quot;Ticket&quot;</span>][i] <span class="keyword">in</span> ticket_rates:</span><br><span class="line">        train_ticket_survival_rate.append(ticket_rates[df_train[<span class="string">&quot;Ticket&quot;</span>][i]])</span><br><span class="line">        train_ticket_survival_rate_NA.append(<span class="number">1</span>)</span><br><span class="line">    <span class="keyword">else</span>:</span><br><span class="line">        train_ticket_survival_rate.append(mean_survival_rate)</span><br><span class="line">        train_ticket_survival_rate_NA.append(<span class="number">0</span>)</span><br><span class="line">       </span><br><span class="line"><span class="keyword">for</span> i <span class="keyword">in</span> <span class="built_in">range</span>(<span class="built_in">len</span>(df_test)):</span><br><span class="line">    <span class="keyword">if</span> df_test[<span class="string">&quot;Ticket&quot;</span>].iloc[i] <span class="keyword">in</span> 
ticket_rates:</span><br><span class="line">        test_ticket_survival_rate.append(ticket_rates[df_test[<span class="string">&quot;Ticket&quot;</span>].iloc[i]])</span><br><span class="line">        test_ticket_survival_rate_NA.append(<span class="number">1</span>)</span><br><span class="line">    <span class="keyword">else</span>:</span><br><span class="line">        test_ticket_survival_rate.append(mean_survival_rate)</span><br><span class="line">        test_ticket_survival_rate_NA.append(<span class="number">0</span>)</span><br><span class="line">       </span><br><span class="line">df_train[<span class="string">&quot;Ticket_Survival_Rate&quot;</span>] = train_ticket_survival_rate</span><br><span class="line">df_train[<span class="string">&quot;Ticket_Survival_Rate_NA&quot;</span>] = train_ticket_survival_rate_NA</span><br><span class="line">df_test[<span class="string">&quot;Ticket_Survival_Rate&quot;</span>] = test_ticket_survival_rate</span><br><span class="line">df_test[<span class="string">&quot;Ticket_Survival_Rate_NA&quot;</span>] = test_ticket_survival_rate_NA</span><br><span class="line">   </span><br><span class="line"><span class="keyword">for</span> df <span class="keyword">in</span> [df_train, df_test]:</span><br><span class="line">    df[<span class="string">&quot;Survival_Rate&quot;</span>] = (df[<span class="string">&quot;Ticket_Survival_Rate&quot;</span>] + df[<span class="string">&quot;Family_Survival_Rate&quot;</span>]) / <span class="number">2</span></span><br><span class="line">    df[<span class="string">&quot;Survival_Rate_NA&quot;</span>] = (df[<span class="string">&quot;Ticket_Survival_Rate_NA&quot;</span>] + df[<span class="string">&quot;Family_Survival_Rate_NA&quot;</span>]) / <span class="number">2</span></span><br></pre></td></tr></table></figure>
<p>特征工程最后是进行特征转换。<br>先标记编码非数值特征，主要是Embarked,Sex,Deck,Title和Family_Size_Grouped等特征，Age和Fare是分类特征。使用LabelEncoder将其转换为数值类型。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># 使用LabelEncoder将分类特征转换为数值类型</span></span><br><span class="line">non_numeric_features = [<span class="string">&quot;Embarked&quot;</span>, <span class="string">&quot;Sex&quot;</span>, <span class="string">&quot;Deck&quot;</span>, <span class="string">&quot;Title&quot;</span>, <span class="string">&quot;Family_Size_Grouped&quot;</span>, <span class="string">&quot;Age&quot;</span>, <span class="string">&quot;Fare&quot;</span>]</span><br><span class="line"><span class="keyword">for</span> df <span class="keyword">in</span> dfs:</span><br><span class="line">    <span class="keyword">for</span> feature <span class="keyword">in</span> non_numeric_features:</span><br><span class="line">        df[feature] = LabelEncoder().fit_transform(df[feature])</span><br></pre></td></tr></table></figure>
<p>最后，使用独热编码(One-Hot Encoding)处理分类特征。<br>使用OneHotEncoder。之所以不直接赋值为1，2，3，……是因为分类器往往默认数据是连续的，并且是有序的。<br>具体代码</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br></pre></td><td class="code"><pre><span class="line"># 使用独热编码处理分类特征</span><br><span class="line">cat_features &#x3D; [&quot;Pclass&quot;, &quot;Sex&quot;, &quot;Deck&quot;, &quot;Embarked&quot;, &quot;Title&quot;, &quot;Family_Size_Grouped&quot;]</span><br><span class="line">encoded_features &#x3D; []</span><br><span class="line"></span><br><span class="line">for df in dfs:</span><br><span class="line">    for feature in cat_features:</span><br><span class="line">        encoded_feat &#x3D; OneHotEncoder().fit_transform(df[feature].values.reshape(-1,1)).toarray()</span><br><span class="line">        n &#x3D; df[feature].nunique()</span><br><span class="line">        cols &#x3D; [&quot;&#123;&#125;_&#123;&#125;&quot;.format(feature, n) for n in range(1, n+1)]</span><br><span class="line">        encoded_df &#x3D; pd.DataFrame(encoded_feat, columns &#x3D; cols)</span><br><span class="line">        encoded_df.index &#x3D; df.index</span><br><span class="line">        encoded_features.append(encoded_df)</span><br><span class="line">        </span><br><span class="line">df_train &#x3D; pd.concat([df_train, *encoded_features[:6]], axis &#x3D; 1)</span><br><span class="line">df_test &#x3D; pd.concat([df_test, *encoded_features[6:]], axis &#x3D; 1)</span><br></pre></td></tr></table></figure>
<p>特征工程小结:将Age和Fare进行了分组，通过Parch和SibSp组合生成了Family_Size，Ticket_Frequency显示了Ticket的出现频率。Name特征非常有用，衍生出几个特征。最后是处理了一些分类变量。<br>现在把数据组合起来，输出看看。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># 将数据组合，保留有用的特征</span></span><br><span class="line">df_all = tools.concat_df(df_train, df_test)</span><br><span class="line">drop_cols = [<span class="string">&#x27;Deck&#x27;</span>, <span class="string">&#x27;Embarked&#x27;</span>, <span class="string">&#x27;Family&#x27;</span>, <span class="string">&#x27;Family_Size&#x27;</span>, <span class="string">&#x27;Family_Size_Grouped&#x27;</span>, <span class="string">&#x27;Survived&#x27;</span>, <span class="string">&#x27;Name&#x27;</span>, <span class="string">&#x27;Parch&#x27;</span>, <span class="string">&#x27;PassengerId&#x27;</span>, <span class="string">&#x27;Pclass&#x27;</span>, <span class="string">&#x27;Sex&#x27;</span>, <span class="string">&#x27;SibSp&#x27;</span>, <span class="string">&#x27;Ticket&#x27;</span>, <span class="string">&#x27;Title&#x27;</span>, <span class="string">&#x27;Ticket_Survival_Rate&#x27;</span>,<span class="string">&#x27;Family_Survival_Rate&#x27;</span>, <span class="string">&#x27;Ticket_Survival_Rate_NA&#x27;</span>, <span class="string">&#x27;Family_Survival_Rate_NA&#x27;</span>]</span><br><span class="line">df_all.drop(columns = drop_cols, inplace = <span class="literal">True</span>)</span><br><span class="line">print(df_all.head())</span><br><span class="line">print(df_all.info())</span><br></pre></td></tr></table></figure>
<p><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/25.png"><br>4.建模<br>终于到了最激动人心的一步了，开始建模。<br>先处理数据</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># 划分数据</span></span><br><span class="line">df_train, df_test = tools.divide_df(df_all)</span><br><span class="line">X_train = StandardScaler().fit_transform(df_train.drop([<span class="string">&quot;Survived&quot;</span>], axis = <span class="number">1</span>))</span><br><span class="line">y_train = df_train[<span class="string">&quot;Survived&quot;</span>].values</span><br><span class="line">X_test = StandardScaler().fit_transform(df_test)</span><br><span class="line"></span><br><span class="line">print(<span class="string">&#x27;X_train shape: &#123;&#125;&#x27;</span>.<span class="built_in">format</span>(X_train.shape))</span><br><span class="line">print(<span class="string">&#x27;y_train shape: &#123;&#125;&#x27;</span>.<span class="built_in">format</span>(y_train.shape))</span><br><span class="line">print(<span class="string">&#x27;X_test shape: &#123;&#125;&#x27;</span>.<span class="built_in">format</span>(X_test.shape))</span><br></pre></td></tr></table></figure>
<p>结果<br>X_train shape: (891, 26)<br>y_train shape: (891,)<br>X_test shape: (418, 26)<br>采用随机森林模型，建立两个模型，一个是单独的模型，另一个是k折交叉验证的模型。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># ④建模</span></span><br><span class="line"><span class="comment"># 划分数据</span></span><br><span class="line">df_train, df_test = tools.divide_df(df_all)</span><br><span class="line">X_train = StandardScaler().fit_transform(df_train.drop([<span class="string">&quot;Survived&quot;</span>], axis = <span class="number">1</span>))</span><br><span class="line">y_train 
= df_train[<span class="string">&quot;Survived&quot;</span>].values</span><br><span class="line">X_test = StandardScaler().fit_transform(df_test)</span><br><span class="line">   </span><br><span class="line">print(<span class="string">&#x27;X_train shape: &#123;&#125;&#x27;</span>.<span class="built_in">format</span>(X_train.shape))</span><br><span class="line">print(<span class="string">&#x27;y_train shape: &#123;&#125;&#x27;</span>.<span class="built_in">format</span>(y_train.shape))</span><br><span class="line">print(<span class="string">&#x27;X_test shape: &#123;&#125;&#x27;</span>.<span class="built_in">format</span>(X_test.shape))</span><br><span class="line"></span><br><span class="line">single_best_model = RFC(criterion = <span class="string">&quot;gini&quot;</span>, n_estimators = <span class="number">1100</span>, max_depth = <span class="number">5</span>, min_samples_split=<span class="number">4</span>, min_samples_leaf=<span class="number">5</span>, max_features=<span class="string">&#x27;auto&#x27;</span>, oob_score=<span class="literal">True</span>, random_state=SEED, n_jobs=-<span class="number">1</span>, verbose=<span class="number">1</span>)</span><br><span class="line">leaderboard_model = RFC(criterion = <span class="string">&quot;gini&quot;</span>, n_estimators = <span class="number">1750</span>, max_depth = <span class="number">7</span>, min_samples_split=<span class="number">6</span>, min_samples_leaf=<span class="number">6</span>, max_features=<span class="string">&#x27;auto&#x27;</span>, oob_score=<span class="literal">True</span>, random_state=SEED, n_jobs=-<span class="number">1</span>, verbose=<span class="number">1</span>)</span><br><span class="line">N = <span class="number">5</span></span><br><span class="line">oob = <span class="number">0</span></span><br><span class="line">probs = pd.DataFrame(np.zeros((<span class="built_in">len</span>(X_test), N*<span class="number">2</span>)), columns = [<span 
class="string">&#x27;Fold_&#123;&#125;_Prob_&#123;&#125;&#x27;</span>.<span class="built_in">format</span>(i, j) <span class="keyword">for</span> i <span class="keyword">in</span> <span class="built_in">range</span>(<span class="number">1</span>, N + <span class="number">1</span>) <span class="keyword">for</span> j <span class="keyword">in</span> <span class="built_in">range</span>(<span class="number">2</span>)])</span><br><span class="line">df_temp = df_all.drop([<span class="string">&quot;Survived&quot;</span>], axis = <span class="number">1</span>)</span><br><span class="line">importances = pd.DataFrame(np.zeros((X_train.shape[<span class="number">1</span>], N)), columns=[<span class="string">&#x27;Fold_&#123;&#125;&#x27;</span>.<span class="built_in">format</span>(i) <span class="keyword">for</span> i <span class="keyword">in</span> <span class="built_in">range</span>(<span class="number">1</span>, N + <span class="number">1</span>)], index=df_temp.columns)</span><br><span class="line">fprs, tprs, scores = [], [], []</span><br><span class="line">skf = StratifiedKFold(n_splits=N, random_state=N, shuffle=<span class="literal">True</span>)</span><br><span class="line">   </span><br><span class="line"><span class="keyword">for</span> fold, (trn_idx, val_idx) <span class="keyword">in</span> <span class="built_in">enumerate</span>(skf.split(X_train, y_train), <span class="number">1</span>):</span><br><span class="line">    print(<span class="string">&#x27;Fold &#123;&#125;\n&#x27;</span>.<span class="built_in">format</span>(fold))</span><br><span class="line">   </span><br><span class="line">    <span class="comment"># 模型拟合</span></span><br><span class="line">    leaderboard_model.fit(X_train[trn_idx], y_train[trn_idx])</span><br><span class="line">   </span><br><span class="line">    <span class="comment"># 计算训练的AUC分数</span></span><br><span class="line">    trn_fpr, trn_tpr, trn_thresholds = roc_curve(y_train[trn_idx], 
leaderboard_model.predict_proba(X_train[trn_idx])[:, <span class="number">1</span>])</span><br><span class="line">    trn_auc_score = auc(trn_fpr, trn_tpr)</span><br><span class="line">    <span class="comment"># 计算检验的AUC分数</span></span><br><span class="line">    val_fpr, val_tpr, val_thresholds = roc_curve(y_train[val_idx], leaderboard_model.predict_proba(X_train[val_idx])[:, <span class="number">1</span>])</span><br><span class="line">    val_auc_score = auc(val_fpr, val_tpr)</span><br><span class="line">   </span><br><span class="line">    scores.append((trn_auc_score, val_auc_score))</span><br><span class="line">    fprs.append(val_fpr)</span><br><span class="line">    tprs.append(val_tpr)</span><br><span class="line">   </span><br><span class="line">    <span class="comment"># X_test概率</span></span><br><span class="line">    probs.loc[:, <span class="string">&#x27;Fold_&#123;&#125;_Prob_0&#x27;</span>.<span class="built_in">format</span>(fold)] = leaderboard_model.predict_proba(X_test)[:, <span class="number">0</span>]</span><br><span class="line">    probs.loc[:, <span class="string">&#x27;Fold_&#123;&#125;_Prob_1&#x27;</span>.<span class="built_in">format</span>(fold)] = leaderboard_model.predict_proba(X_test)[:, <span class="number">1</span>]</span><br><span class="line">    importances.iloc[:, fold - <span class="number">1</span>] = leaderboard_model.feature_importances_</span><br><span class="line">   </span><br><span class="line">    oob += leaderboard_model.oob_score_ / N</span><br><span class="line">    print(<span class="string">&#x27;Fold &#123;&#125; OOB Score: &#123;&#125;\n&#x27;</span>.<span class="built_in">format</span>(fold, leaderboard_model.oob_score_))</span><br><span class="line">   </span><br><span class="line">print(<span class="string">&#x27;Average OOB Score: &#123;&#125;&#x27;</span>.<span class="built_in">format</span>(oob))</span><br></pre></td></tr></table></figure>
<p><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/26.png"><br>运行结果，平均评分为0.84。这段说实话没太懂，原文照抄了。<br>画图看看。先画出特征在模型中的重要性。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># 画图看看</span></span><br><span class="line">importances[<span class="string">&#x27;Mean_Importance&#x27;</span>] = importances.mean(axis=<span class="number">1</span>)</span><br><span class="line">importances.sort_values(by=<span class="string">&#x27;Mean_Importance&#x27;</span>, inplace=<span class="literal">True</span>, ascending=<span class="literal">False</span>)</span><br><span class="line"></span><br><span class="line">plt.figure(figsize=(<span class="number">15</span>, <span class="number">20</span>))</span><br><span class="line">sns.barplot(x=<span class="string">&#x27;Mean_Importance&#x27;</span>, y=importances.index, data=importances)</span><br><span class="line"></span><br><span class="line">plt.xlabel(<span class="string">&#x27;&#x27;</span>)</span><br><span class="line">plt.tick_params(axis=<span class="string">&#x27;x&#x27;</span>, labelsize=<span class="number">15</span>)</span><br><span class="line">plt.tick_params(axis=<span class="string">&#x27;y&#x27;</span>, labelsize=<span class="number">15</span>)</span><br><span class="line">plt.title(<span class="string">&#x27;Random Forest Classifier Mean Feature Importance Between Folds&#x27;</span>, size=<span class="number">15</span>)</span><br><span class="line"></span><br><span class="line">plt.savefig(<span class="string">&quot;RandomForest.png&quot;</span>)</span><br></pre></td></tr></table></figure>
<p><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/27.png"></p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># 画ROC曲线</span></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">plot_roc_curve</span>(<span class="params">fprs, tprs</span>):</span></span><br><span class="line">   </span><br><span class="line">    tprs_interp = []</span><br><span class="line">    aucs = []</span><br><span class="line">    mean_fpr = np.linspace(<span 
class="number">0</span>, <span class="number">1</span>, <span class="number">100</span>)</span><br><span class="line">    f, ax = plt.subplots(figsize=(<span class="number">15</span>, <span class="number">15</span>))</span><br><span class="line">  </span><br><span class="line">   <span class="comment"># 为每次折叠测试画ROC曲线并计算AUC值</span></span><br><span class="line">    <span class="keyword">for</span> i, (fpr, tpr) <span class="keyword">in</span> <span class="built_in">enumerate</span>(<span class="built_in">zip</span>(fprs, tprs), <span class="number">1</span>):</span><br><span class="line">        tprs_interp.append(np.interp(mean_fpr, fpr, tpr))</span><br><span class="line">        tprs_interp[-<span class="number">1</span>][<span class="number">0</span>] = <span class="number">0.0</span></span><br><span class="line">        roc_auc = auc(fpr, tpr)</span><br><span class="line">        aucs.append(roc_auc)</span><br><span class="line">        ax.plot(fpr, tpr, lw=<span class="number">1</span>, alpha=<span class="number">0.3</span>, label=<span class="string">&#x27;ROC Fold &#123;&#125; (AUC = &#123;:.3f&#125;)&#x27;</span>.<span class="built_in">format</span>(i, roc_auc))</span><br><span class="line">       </span><br><span class="line">    <span class="comment"># 为随机猜测画ROC图</span></span><br><span class="line">    plt.plot([<span class="number">0</span>, <span class="number">1</span>], [<span class="number">0</span>, <span class="number">1</span>], linestyle=<span class="string">&#x27;--&#x27;</span>, lw=<span class="number">2</span>, color=<span class="string">&#x27;r&#x27;</span>, alpha=<span class="number">0.8</span>, label=<span class="string">&#x27;Random Guessing&#x27;</span>)</span><br><span class="line">   </span><br><span class="line">    mean_tpr = np.mean(tprs_interp, axis=<span class="number">0</span>)</span><br><span class="line">    mean_tpr[-<span class="number">1</span>] = <span class="number">1.0</span></span><br><span class="line">    mean_auc = 
auc(mean_fpr, mean_tpr)</span><br><span class="line">    std_auc = np.std(aucs)</span><br><span class="line">   </span><br><span class="line">    <span class="comment"># 画平均ROC值</span></span><br><span class="line">    ax.plot(mean_fpr, mean_tpr, color=<span class="string">&#x27;b&#x27;</span>, label=<span class="string">&#x27;Mean ROC (AUC = &#123;:.3f&#125; $\pm$ &#123;:.3f&#125;)&#x27;</span>.<span class="built_in">format</span>(mean_auc, std_auc), lw=<span class="number">2</span>, alpha=<span class="number">0.8</span>)</span><br><span class="line">   </span><br><span class="line">    <span class="comment"># 画平均ROC的标准差</span></span><br><span class="line">    std_tpr = np.std(tprs_interp, axis=<span class="number">0</span>)</span><br><span class="line">    tprs_upper = np.minimum(mean_tpr + std_tpr, <span class="number">1</span>)</span><br><span class="line">    tprs_lower = np.maximum(mean_tpr - std_tpr, <span class="number">0</span>)</span><br><span class="line">    ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color=<span class="string">&#x27;grey&#x27;</span>, alpha=<span class="number">.2</span>, label=<span class="string">&#x27;$\pm$ 1 std. 
dev.&#x27;</span>)</span><br><span class="line">   </span><br><span class="line">    ax.set_xlabel(<span class="string">&#x27;False Positive Rate&#x27;</span>, size=<span class="number">15</span>, labelpad=<span class="number">20</span>)</span><br><span class="line">    ax.set_ylabel(<span class="string">&#x27;True Positive Rate&#x27;</span>, size=<span class="number">15</span>, labelpad=<span class="number">20</span>)</span><br><span class="line">    ax.tick_params(axis=<span class="string">&#x27;x&#x27;</span>, labelsize=<span class="number">15</span>)</span><br><span class="line">    ax.tick_params(axis=<span class="string">&#x27;y&#x27;</span>, labelsize=<span class="number">15</span>)</span><br><span class="line">    ax.set_xlim([-<span class="number">0.05</span>, <span class="number">1.05</span>])</span><br><span class="line">    ax.set_ylim([-<span class="number">0.05</span>, <span class="number">1.05</span>])</span><br><span class="line"></span><br><span class="line">    ax.set_title(<span class="string">&#x27;ROC Curves of Folds&#x27;</span>, size=<span class="number">20</span>, y=<span class="number">1.02</span>)</span><br><span class="line">    ax.legend(loc=<span class="string">&#x27;lower right&#x27;</span>, prop=&#123;<span class="string">&#x27;size&#x27;</span>: <span class="number">13</span>&#125;)</span><br><span class="line">   </span><br><span class="line">    plt.savefig(<span class="string">&quot;ROC.png&quot;</span>)</span><br><span class="line"></span><br><span class="line">plot_roc_curve(fprs, tprs)</span><br></pre></td></tr></table></figure>
<p><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/28.png"><br>最后，进行预测，提交kaggle。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># 预测结果提交</span></span><br><span class="line">class_survived = [col <span class="keyword">for</span> col <span class="keyword">in</span> probs.columns <span class="keyword">if</span> col.endswith(<span class="string">&#x27;Prob_1&#x27;</span>)]</span><br><span class="line">probs[<span class="string">&#x27;1&#x27;</span>] = probs[class_survived].<span class="built_in">sum</span>(axis=<span class="number">1</span>) / N</span><br><span class="line">probs[<span class="string">&#x27;0&#x27;</span>] = probs.drop(columns=class_survived).<span class="built_in">sum</span>(axis=<span class="number">1</span>) / N</span><br><span class="line">probs[<span class="string">&#x27;pred&#x27;</span>] = <span class="number">0</span></span><br><span class="line">pos = probs[probs[<span class="string">&#x27;1&#x27;</span>] &gt;= <span class="number">0.5</span>].index</span><br><span class="line">probs.loc[pos, <span class="string">&#x27;pred&#x27;</span>] = <span class="number">1</span></span><br><span class="line"></span><br><span class="line">y_pred = probs[<span class="string">&#x27;pred&#x27;</span>].astype(<span class="built_in">int</span>)</span><br><span class="line"></span><br><span class="line">submission_df = pd.DataFrame(columns=[<span class="string">&#x27;PassengerId&#x27;</span>, <span class="string">&#x27;Survived&#x27;</span>])</span><br><span 
class="line">submission_df[<span class="string">&#x27;PassengerId&#x27;</span>] = df_test[<span class="string">&#x27;PassengerId&#x27;</span>]</span><br><span class="line">submission_df[<span class="string">&#x27;Survived&#x27;</span>] = y_pred.values</span><br><span class="line">submission_df.to_csv(<span class="string">&#x27;submissions.csv&#x27;</span>, header=<span class="literal">True</span>, index=<span class="literal">False</span>)</span><br><span class="line">print(submission_df.head(<span class="number">10</span>))</span><br></pre></td></tr></table></figure>
<p>提交到kaggle看看<br><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/29.png"><br><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/30.png"><br>0.80分，排1222名。比我自己之前做的都好了。再看看怎么改进吧。<br>总结一下跟着大神的kernel走一遍的收获:<br>①进行数据清洗和特征工程之前要将训练数据和测试数据合并到一起，防止二者之间出现偏差。<br>②处理缺失数据，要根据数据的特性选择合适的方法。比如年龄数据的缺失值，根据姓名称呼用不同年龄段的乘客的中位数就要比简单用所有乘客的年龄中位数要准确一些。也可以获取一些额外的数据，比如通过谷歌搜索用真实值来填充缺失数据。<br>③相关领域的知识越多越有利于特征工程。比如泰坦尼克号船舱分布的知识，用来处理Cabin的缺失数据。<br>④进行数据分析和特征工程时可以根据数据特征新建新的属性以更好的整合数据的信息，使其规律更加明显。<br>⑤分类特征可以用独热编码One-Hot-Encoding进行转换，避免分类器将类别编码误当作连续有序的数据。<br>接下来，再参考其它文章进行一些改进吧。<br>先照这篇文章写一个画模型的学习曲线的函数[2]</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># 绘制模型的学习曲线</span></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">plot_learning_curve</span>(<span class="params">estimator, title, X, y, ylim=<span class="literal">None</span>, cv=<span class="literal">None</span>, n_jobs=<span class="number">1</span>,train_sizes=np.linspace(<span class="params"><span class="number">.1</span>, <span class="number">1.0</span>, <span class="number">5</span></span>),verbose=<span class="number">0</span></span>):</span></span><br><span class="line">    plt.figure()</span><br><span class="line">    plt.title(title)</span><br><span class="line">    <span class="keyword">if</span> ylim <span class="keyword">is</span> <span class="keyword">not</span> <span class="literal">None</span>:</span><br><span class="line">        plt.ylim(*ylim)</span><br><span class="line">    
plt.xlabel(<span class="string">&quot;Training examples&quot;</span>)</span><br><span class="line">    plt.ylabel(<span class="string">&quot;Score&quot;</span>)</span><br><span class="line">    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv=cv,n_jobs=n_jobs, train_sizes=train_sizes)</span><br><span class="line">    train_scores_mean = np.mean(train_scores, axis=<span class="number">1</span>)</span><br><span class="line">    train_scores_std = np.std(train_scores, axis=<span class="number">1</span>)</span><br><span class="line">    test_scores_mean = np.mean(test_scores, axis=<span class="number">1</span>)</span><br><span class="line">    test_scores_std = np.std(test_scores, axis=<span class="number">1</span>)</span><br><span class="line">    plt.grid()</span><br><span class="line"></span><br><span class="line">    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,train_scores_mean + train_scores_std, alpha=<span class="number">0.1</span>, color=<span class="string">&quot;r&quot;</span>)</span><br><span class="line">    plt.fill_between(train_sizes,test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=<span class="number">0.1</span>, color=<span class="string">&quot;g&quot;</span>)</span><br><span class="line">    plt.plot(train_sizes, train_scores_mean, <span class="string">&#x27;o-&#x27;</span>, color=<span class="string">&quot;r&quot;</span>, label=<span class="string">&quot;Training score&quot;</span>)</span><br><span class="line">    plt.plot(train_sizes, test_scores_mean, <span class="string">&#x27;o-&#x27;</span>, color=<span class="string">&quot;g&quot;</span>, label=<span class="string">&quot;Cross-validation score&quot;</span>)</span><br><span class="line"></span><br><span class="line">    plt.legend(loc=<span class="string">&quot;best&quot;</span>)</span><br><span class="line">    <span class="keyword">return</span> plt</span><br><span class="line"></span><br><span class="line">    
<span class="comment"># ⑤模型评估</span></span><br><span class="line">    rf_parameters = &#123;<span class="string">&quot;criterion&quot;</span>:<span class="string">&quot;gini&quot;</span>, <span class="string">&quot;n_estimators&quot;</span>:<span class="number">1750</span>, <span class="string">&quot;max_depth&quot;</span>:<span class="number">7</span>, <span class="string">&quot;min_samples_split&quot;</span>:<span class="number">6</span>, <span class="string">&quot;min_samples_leaf&quot;</span>:<span class="number">6</span>, <span class="string">&quot;max_features&quot;</span>:<span class="string">&#x27;auto&#x27;</span>, <span class="string">&quot;oob_score&quot;</span>:<span class="literal">True</span>, <span class="string">&quot;random_state&quot;</span>:SEED, <span class="string">&quot;n_jobs&quot;</span>:-<span class="number">1</span>, <span class="string">&quot;verbose&quot;</span>:<span class="number">1</span>&#125;</span><br><span class="line">    title = <span class="string">&quot;RandomForest&quot;</span></span><br><span class="line">    df_train, df_test = tools.divide_df(df_all)</span><br><span class="line">    X_train = StandardScaler().fit_transform(df_train.drop([<span class="string">&quot;Survived&quot;</span>], axis = <span class="number">1</span>))</span><br><span class="line">    y_train = df_train[<span class="string">&quot;Survived&quot;</span>].values</span><br><span class="line">    plt = plot_learning_curve(RFC(**rf_parameters), title, X_train, y_train, cv=<span class="literal">None</span>, n_jobs=-<span class="number">1</span>, train_sizes=[<span class="number">50</span>, <span class="number">100</span>, <span class="number">150</span>, <span class="number">200</span>, <span class="number">250</span>, <span class="number">350</span>, <span class="number">400</span>, <span class="number">450</span>, <span class="number">500</span>])</span><br><span class="line">    plt.savefig(<span 
class="string">&quot;learningCurve.png&quot;</span>)</span><br></pre></td></tr></table></figure>
<p><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/31.png"><br>接下来就测试不同的模型了。<br>根据这篇文章[3]，这个问题属于有监督学习的分类问题，可以使用的算法有:</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">from</span> sklearn.linear_model <span class="keyword">import</span> LogisticRegression    <span class="comment">#逻辑回归</span></span><br><span class="line"><span class="keyword">from</span> sklearn.svm <span class="keyword">import</span> SVC, LinearSVC                 <span class="comment">#支持向量机</span></span><br><span class="line"><span class="keyword">from</span> sklearn.ensemble <span class="keyword">import</span> RandomForestClassifier    <span class="comment">#随机森林</span></span><br><span class="line"><span class="keyword">from</span> sklearn.neighbors <span class="keyword">import</span> KNeighborsClassifier     <span class="comment">#K最邻近算法</span></span><br><span class="line"><span class="keyword">from</span> sklearn.naive_bayes <span class="keyword">import</span> GaussianNB             <span class="comment">#朴素贝叶斯</span></span><br><span class="line"><span class="keyword">from</span> sklearn.linear_model <span class="keyword">import</span> Perceptron            <span class="comment">#感知机算法            </span></span><br><span class="line"><span class="keyword">from</span> sklearn.linear_model <span class="keyword">import</span> SGDClassifier         <span class="comment">#梯度下降分类</span></span><br><span class="line"><span class="keyword">from</span> sklearn.tree <span class="keyword">import</span> DecisionTreeClassifier        <span class="comment">#决策树算法</span></span><br><span 
class="line"><span class="keyword">from</span> sklearn.model_selection <span class="keyword">import</span> StratifiedKFold    <span class="comment">#K折交叉切分</span></span><br><span class="line"><span class="keyword">from</span> sklearn.model_selection <span class="keyword">import</span> GridSearchCV       <span class="comment">#网格搜索</span></span><br><span class="line"></span><br><span class="line"><span class="keyword">import</span> tools</span><br><span class="line"><span class="keyword">import</span> numpy <span class="keyword">as</span> np</span><br><span class="line"><span class="keyword">import</span> pandas <span class="keyword">as</span> pd</span><br></pre></td></tr></table></figure>
<p>挨个试吧。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br><span class="line">48</span><br><span class="line">49</span><br><span class="line">50</span><br><span class="line">51</span><br><span class="line">52</span><br><span class="line">53</span><br><span class="line">54</span><br><span class="line">55</span><br><span class="line">56</span><br><span class="line">57</span><br><span class="line">58</span><br><span class="line">59</span><br><span class="line">60</span><br><span 
class="line">61</span><br><span class="line">62</span><br><span class="line">63</span><br><span class="line">64</span><br><span class="line">65</span><br><span class="line">66</span><br><span class="line">67</span><br><span class="line">68</span><br><span class="line">69</span><br><span class="line">70</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># 模型测试</span></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">ModelTest</span>(<span class="params">Model, X_train, Y_train</span>):</span></span><br><span class="line">    Model.fit(X_train, Y_train)</span><br><span class="line">    <span class="comment"># 对模型评分</span></span><br><span class="line">    acc_result = <span class="built_in">round</span>(Model.score(X_train, Y_train)*<span class="number">100</span>, <span class="number">2</span>)</span><br><span class="line">    <span class="keyword">return</span> acc_result</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="comment"># 尝试各种模型</span></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">model_compare</span>(<span class="params">df_all</span>):</span></span><br><span class="line">    <span class="comment"># 划分数据</span></span><br><span class="line">    train_df, test_df = tools.divide_df(df_all)</span><br><span class="line">    X_train = train_df.drop([<span class="string">&quot;Survived&quot;</span>, <span class="string">&quot;PassengerId&quot;</span>], axis = <span class="number">1</span>)</span><br><span class="line">    Y_train = train_df[<span class="string">&quot;Survived&quot;</span>]</span><br><span class="line">    X_test = test_df</span><br><span class="line">    print(X_train.shape, Y_train.shape, X_test.shape)</span><br><span class="line">   </span><br><span class="line">    <span class="comment"># 逻辑回归模型</span></span><br><span class="line">    LogModel = 
LogisticRegression()</span><br><span class="line">    acc_log = ModelTest(LogModel, X_train, Y_train)</span><br><span class="line">    print(<span class="string">&quot;逻辑回归结果:&#123;&#125;&quot;</span>.<span class="built_in">format</span>(acc_log))</span><br><span class="line">   </span><br><span class="line">    <span class="comment"># SVM支持向量机模型</span></span><br><span class="line">    SVMModel = SVC()</span><br><span class="line">    acc_svc = ModelTest(SVMModel, X_train, Y_train)</span><br><span class="line">    print(<span class="string">&quot;支持向量机结果:&#123;&#125;&quot;</span>.<span class="built_in">format</span>(acc_svc))</span><br><span class="line">   </span><br><span class="line">    <span class="comment"># knn算法</span></span><br><span class="line">    knnModel = KNeighborsClassifier(n_neighbors = <span class="number">3</span>)</span><br><span class="line">    acc_knn = ModelTest(knnModel, X_train, Y_train)</span><br><span class="line">    print(<span class="string">&quot;knn结果:&#123;&#125;&quot;</span>.<span class="built_in">format</span>(acc_knn))</span><br><span class="line">   </span><br><span class="line">    <span class="comment"># 朴素贝叶斯模型</span></span><br><span class="line">    BYSModel = GaussianNB()</span><br><span class="line">    acc_bys = ModelTest(BYSModel, X_train, Y_train)</span><br><span class="line">    print(<span class="string">&quot;朴素贝叶斯算法结果:&#123;&#125;&quot;</span>.<span class="built_in">format</span>(acc_bys))</span><br><span class="line">   </span><br><span class="line">    <span class="comment"># 感知机算法</span></span><br><span class="line">    percModel = Perceptron()</span><br><span class="line">    acc_perc = ModelTest(percModel, X_train, Y_train)</span><br><span class="line">    print(<span class="string">&quot;感知机算法算法结果:&#123;&#125;&quot;</span>.<span class="built_in">format</span>(acc_perc))</span><br><span class="line">   </span><br><span class="line">    <span class="comment"># 线性分类支持向量机</span></span><br><span class="line">    
lin_svcModel = LinearSVC()</span><br><span class="line">    acc_lin_svc = ModelTest(lin_svcModel, X_train, Y_train)</span><br><span class="line">    print(<span class="string">&quot;线性分类支持向量机算法结果:&#123;&#125;&quot;</span>.<span class="built_in">format</span>(acc_lin_svc))</span><br><span class="line">   </span><br><span class="line">    <span class="comment"># 梯度下降分类算法</span></span><br><span class="line">    sgdModel = SGDClassifier()</span><br><span class="line">    acc_sgd = ModelTest(sgdModel, X_train, Y_train)</span><br><span class="line">    print(<span class="string">&quot;梯度下降分类算法结果:&#123;&#125;&quot;</span>.<span class="built_in">format</span>(acc_sgd))</span><br><span class="line">   </span><br><span class="line">    <span class="comment"># 决策树算法</span></span><br><span class="line">    treeModel = DecisionTreeClassifier()</span><br><span class="line">    acc_tree = ModelTest(treeModel, X_train, Y_train)</span><br><span class="line">    print(<span class="string">&quot;决策树算法结果:&#123;&#125;&quot;</span>.<span class="built_in">format</span>(acc_tree))</span><br><span class="line">   </span><br><span class="line">    <span class="comment"># 随机森林算法</span></span><br><span class="line">    forestModel = RandomForestClassifier()</span><br><span class="line">    acc_rand = ModelTest(forestModel, X_train, Y_train)</span><br><span class="line">    print(<span class="string">&quot;随机森林算法结果:&#123;&#125;&quot;</span>.<span class="built_in">format</span>(acc_rand))</span><br><span class="line">   </span><br><span class="line">    <span class="comment"># 模型评分</span></span><br><span class="line">    models = pd.DataFrame(&#123;</span><br><span class="line">    <span class="string">&#x27;Model&#x27;</span>: [<span class="string">&#x27;Support Vector Machines&#x27;</span>, <span class="string">&#x27;KNN&#x27;</span>, <span class="string">&#x27;Logistic Regression&#x27;</span>, <span class="string">&#x27;Random Forest&#x27;</span>, <span class="string">&#x27;Naive 
Bayes&#x27;</span>, <span class="string">&#x27;Perceptron&#x27;</span>, <span class="string">&#x27;Stochastic Gradient Decent&#x27;</span>, <span class="string">&#x27;Linear SVC&#x27;</span>,<span class="string">&#x27;Decision Tree&#x27;</span>],</span><br><span class="line">    <span class="string">&#x27;Score&#x27;</span>: [acc_svc, acc_knn, acc_log, acc_rand, acc_bys, acc_perc, acc_sgd, acc_lin_svc, acc_tree]&#125;)</span><br><span class="line">    print(models.sort_values(by=<span class="string">&#x27;Score&#x27;</span>, ascending=<span class="literal">False</span>))</span><br><span class="line">   </span><br><span class="line">    <span class="comment"># 用决策树模型进行预测</span></span><br><span class="line">    tools.Submission(treeModel, test_df, <span class="string">&quot;decisetree.csv&quot;</span>)</span><br></pre></td></tr></table></figure>
<p><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/32.png"><br>几种算法中决策树算法的评分最高，生成预测结果提交试试?<br><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/33.png"><br>还不如随机森林算法呢。<br>再画一下各个算法的学习曲线吧。<br>学习曲线是不同训练集大小，模型在训练集和验证集上的得分变化曲线。也就是以样本数为横坐标，训练和交叉验证集上的得分（如准确率）为纵坐标。learning curve可以帮助我们判断模型现在所处的状态：过拟合（overfitting / high variance） or 欠拟合（underfitting / high bias） 。[4]<br><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/34.png"><br><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/35.png"><br><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/36.png"><br><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/37.png"><br><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/38.png"><br><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/39.png"><br><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/40.png"><br><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/41.png"><br><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/42.png"><br>再研究下怎么对算法进行交叉验证吧。[5]<br>所谓交叉验证，指每次训练都使用训练数据的一个划分(或折，fold):一部分作为训练集，一部分作为测试集，进行多次划分多次训练。<br>在sklearn中如果只是想把数据划分为训练集和测试集，用train_test_split。如果已经有训练集和测试集，希望用训练集训练模型后应用到测试集中，则使用交叉验证的方法来选择模型和参数。具体用cross_val_score。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># 对模型进行交叉验证</span></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">cross_val</span>(<span class="params">model, X, Y, cv=<span class="number">5</span></span>):</span></span><br><span class="line">    scores = cross_val_score(model, X, Y, cv=cv)</span><br><span class="line">    score = scores.mean()</span><br><span class="line">    <span class="keyword">return</span> score</span><br></pre></td></tr></table></figure>
<p><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/43.png"><br>交叉验证的结果跟上面不一样了，用逻辑回归模型提交一次看看吧。<br><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/44.png"><br>跟我之前用逻辑回归模型的预测结果对比一下<br><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/45.png"><br>好了0.005，也就一两个预测结果的差距吧？可见模型评分高的模型未必就一定好。更何况不同的评分方法的排序结果差距很大的。<br>试试模型融合吧。<br>参考[6]<br>先筛选出最重要的几个特征，而不是把所有特征都纳入，避免过拟合。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br><span class="line">48</span><br><span class="line">49</span><br><span class="line">50</span><br><span class="line">51</span><br><span class="line">52</span><br><span class="line">53</span><br><span class="line">54</span><br><span class="line">55</span><br><span class="line">56</span><br><span class="line">57</span><br><span class="line">58</span><br><span class="line">59</span><br><span class="line">60</span><br><span 
class="line">61</span><br><span class="line">62</span><br><span class="line">63</span><br><span class="line">64</span><br><span class="line">65</span><br><span class="line">66</span><br><span class="line">67</span><br><span class="line">68</span><br><span class="line">69</span><br><span class="line">70</span><br><span class="line">71</span><br><span class="line">72</span><br><span class="line">73</span><br><span class="line">74</span><br><span class="line">75</span><br><span class="line">76</span><br><span class="line">77</span><br><span class="line">78</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># 找出最重要的几个特征</span></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">get_top_n_features</span>(<span class="params">df_all, top_n_features</span>):</span></span><br><span class="line">    <span class="comment"># 划分数据</span></span><br><span class="line">    train_df, test_df = tools.divide_df(df_all)</span><br><span class="line">    titanic_train_data_X = train_df.drop([<span class="string">&quot;Survived&quot;</span>, <span class="string">&quot;PassengerId&quot;</span>], axis = <span class="number">1</span>)</span><br><span class="line">    titanic_train_data_Y = train_df[<span class="string">&quot;Survived&quot;</span>]</span><br><span class="line">   </span><br><span class="line">    <span class="comment"># random forest</span></span><br><span class="line">    rf_est = RandomForestClassifier(random_state=<span class="number">0</span>)</span><br><span class="line">    rf_param_grid = &#123;<span class="string">&#x27;n_estimators&#x27;</span>: [<span class="number">500</span>], <span class="string">&#x27;min_samples_split&#x27;</span>: [<span class="number">2</span>, <span class="number">3</span>], <span class="string">&#x27;max_depth&#x27;</span>: [<span class="number">20</span>]&#125;</span><br><span class="line">    rf_grid = model_selection.GridSearchCV(rf_est, 
rf_param_grid, n_jobs=-<span class="number">1</span>, cv=<span class="number">10</span>, verbose=<span class="number">1</span>)</span><br><span class="line">    rf_grid.fit(titanic_train_data_X, titanic_train_data_Y)</span><br><span class="line">    print(<span class="string">&#x27;Top N Features Best RF Params:&#x27;</span> + <span class="built_in">str</span>(rf_grid.best_params_))</span><br><span class="line">    print(<span class="string">&#x27;Top N Features Best RF Score:&#x27;</span> + <span class="built_in">str</span>(rf_grid.best_score_))</span><br><span class="line">    print(<span class="string">&#x27;Top N Features RF Train Score:&#x27;</span> + <span class="built_in">str</span>(rf_grid.score(titanic_train_data_X, titanic_train_data_Y)))</span><br><span class="line">    feature_imp_sorted_rf = pd.DataFrame(&#123;<span class="string">&#x27;feature&#x27;</span>:<span class="built_in">list</span>(titanic_train_data_X), <span class="string">&#x27;importance&#x27;</span>: rf_grid.best_estimator_.feature_importances_&#125;).sort_values(<span class="string">&#x27;importance&#x27;</span>,ascending=<span class="literal">False</span>)</span><br><span class="line">    features_top_n_rf = feature_imp_sorted_rf.head(top_n_features)[<span class="string">&#x27;feature&#x27;</span>]</span><br><span class="line">    print(<span class="string">&#x27;Sample 10 Features from RF Classifier&#x27;</span>)</span><br><span class="line">    print(<span class="built_in">str</span>(features_top_n_rf[:<span class="number">10</span>]))</span><br><span class="line">   </span><br><span class="line">    <span class="comment"># AdaBoost</span></span><br><span class="line">    ada_est =AdaBoostClassifier(random_state=<span class="number">0</span>)</span><br><span class="line">    ada_param_grid = &#123;<span class="string">&#x27;n_estimators&#x27;</span>: [<span class="number">500</span>], <span class="string">&#x27;learning_rate&#x27;</span>: [<span class="number">0.01</span>, <span 
class="number">0.1</span>]&#125;</span><br><span class="line">    ada_grid = model_selection.GridSearchCV(ada_est, ada_param_grid, n_jobs=-<span class="number">1</span>, cv=<span class="number">10</span>, verbose=<span class="number">1</span>)</span><br><span class="line">    ada_grid.fit(titanic_train_data_X, titanic_train_data_Y)</span><br><span class="line">    print(<span class="string">&#x27;Top N Features Best Ada Params:&#x27;</span> + <span class="built_in">str</span>(ada_grid.best_params_))</span><br><span class="line">    print(<span class="string">&#x27;Top N Features Best Ada Score:&#x27;</span> + <span class="built_in">str</span>(ada_grid.best_score_))</span><br><span class="line">    print(<span class="string">&#x27;Top N Features Ada Train Score:&#x27;</span> + <span class="built_in">str</span>(ada_grid.score(titanic_train_data_X, titanic_train_data_Y)))</span><br><span class="line">    feature_imp_sorted_ada = pd.DataFrame(&#123;<span class="string">&#x27;feature&#x27;</span>:<span class="built_in">list</span>(titanic_train_data_X), <span class="string">&#x27;importance&#x27;</span>: ada_grid.best_estimator_.feature_importances_&#125;).sort_values(<span class="string">&#x27;importance&#x27;</span>, ascending=<span class="literal">False</span>)</span><br><span class="line">    features_top_n_ada = feature_imp_sorted_ada.head(top_n_features)[<span class="string">&#x27;feature&#x27;</span>]</span><br><span class="line">    print(<span class="string">&#x27;Sample 10 Feature from Ada Classifier:&#x27;</span>)</span><br><span class="line">    print(<span class="built_in">str</span>(features_top_n_ada[:<span class="number">10</span>]))</span><br><span class="line"></span><br><span class="line">    <span class="comment"># ExtraTree</span></span><br><span class="line">    et_est = ExtraTreesClassifier(random_state=<span class="number">0</span>)</span><br><span class="line">    et_param_grid = &#123;<span class="string">&#x27;n_estimators&#x27;</span>: [<span 
class="number">500</span>], <span class="string">&#x27;min_samples_split&#x27;</span>: [<span class="number">3</span>, <span class="number">4</span>], <span class="string">&#x27;max_depth&#x27;</span>: [<span class="number">20</span>]&#125;</span><br><span class="line">    et_grid = model_selection.GridSearchCV(et_est, et_param_grid, n_jobs=-<span class="number">1</span>, cv=<span class="number">10</span>, verbose=<span class="number">1</span>)</span><br><span class="line">    et_grid.fit(titanic_train_data_X, titanic_train_data_Y)</span><br><span class="line">    print(<span class="string">&#x27;Top N Features Best ET Params:&#x27;</span> + <span class="built_in">str</span>(et_grid.best_params_))</span><br><span class="line">    print(<span class="string">&#x27;Top N Features Best ET Score:&#x27;</span> + <span class="built_in">str</span>(et_grid.best_score_))</span><br><span class="line">    print(<span class="string">&#x27;Top N Features ET Train Score:&#x27;</span> + <span class="built_in">str</span>(et_grid.score(titanic_train_data_X, titanic_train_data_Y)))</span><br><span class="line">    feature_imp_sorted_et = pd.DataFrame(&#123;<span class="string">&#x27;feature&#x27;</span>:<span class="built_in">list</span>(titanic_train_data_X), <span class="string">&#x27;importance&#x27;</span>: et_grid.best_estimator_.feature_importances_&#125;).sort_values(<span class="string">&#x27;importance&#x27;</span>, ascending=<span class="literal">False</span>)</span><br><span class="line">    features_top_n_et =  feature_imp_sorted_et.head(top_n_features)[<span class="string">&#x27;feature&#x27;</span>]</span><br><span class="line">    print(<span class="string">&#x27;Sample 10 Features from ET Classifier:&#x27;</span>)</span><br><span class="line">    print(<span class="built_in">str</span>(features_top_n_et[:<span class="number">10</span>]))</span><br><span class="line">   </span><br><span class="line">    <span class="comment"># GradientBoosting</span></span><br><span 
class="line">    gb_est =GradientBoostingClassifier(random_state=<span class="number">0</span>)</span><br><span class="line">    gb_param_grid = &#123;<span class="string">&#x27;n_estimators&#x27;</span>: [<span class="number">500</span>], <span class="string">&#x27;learning_rate&#x27;</span>: [<span class="number">0.01</span>, <span class="number">0.1</span>], <span class="string">&#x27;max_depth&#x27;</span>: [<span class="number">20</span>]&#125;</span><br><span class="line">    gb_grid = model_selection.GridSearchCV(gb_est, gb_param_grid, n_jobs=-<span class="number">1</span>, cv=<span class="number">10</span>, verbose=<span class="number">1</span>)</span><br><span class="line">    gb_grid.fit(titanic_train_data_X, titanic_train_data_Y)</span><br><span class="line">    print(<span class="string">&#x27;Top N Features Best GB Params:&#x27;</span> + <span class="built_in">str</span>(gb_grid.best_params_))</span><br><span class="line">    print(<span class="string">&#x27;Top N Features Best GB Score:&#x27;</span> + <span class="built_in">str</span>(gb_grid.best_score_))</span><br><span class="line">    print(<span class="string">&#x27;Top N Features GB Train Score:&#x27;</span> + <span class="built_in">str</span>(gb_grid.score(titanic_train_data_X, titanic_train_data_Y)))</span><br><span class="line">    feature_imp_sorted_gb = pd.DataFrame(&#123;<span class="string">&#x27;feature&#x27;</span>:<span class="built_in">list</span>(titanic_train_data_X), <span class="string">&#x27;importance&#x27;</span>: gb_grid.best_estimator_.feature_importances_&#125;).sort_values(<span class="string">&#x27;importance&#x27;</span>, ascending=<span class="literal">False</span>)</span><br><span class="line">    features_top_n_gb = feature_imp_sorted_gb.head(top_n_features)[<span class="string">&#x27;feature&#x27;</span>]</span><br><span class="line">    print(<span class="string">&#x27;Sample 10 Feature from GB Classifier:&#x27;</span>)</span><br><span class="line">    print(<span 
class="built_in">str</span>(features_top_n_gb[:<span class="number">10</span>]))</span><br><span class="line">   </span><br><span class="line">    <span class="comment"># DecisionTree</span></span><br><span class="line">    dt_est = DecisionTreeClassifier(random_state=<span class="number">0</span>)</span><br><span class="line">    dt_param_grid = &#123;<span class="string">&#x27;min_samples_split&#x27;</span>: [<span class="number">2</span>, <span class="number">4</span>], <span class="string">&#x27;max_depth&#x27;</span>: [<span class="number">20</span>]&#125;</span><br><span class="line">    dt_grid = model_selection.GridSearchCV(dt_est, dt_param_grid, n_jobs=-<span class="number">1</span>, cv=<span class="number">10</span>, verbose=<span class="number">1</span>)</span><br><span class="line">    dt_grid.fit(titanic_train_data_X, titanic_train_data_Y)</span><br><span class="line">    print(<span class="string">&#x27;Top N Features Best DT Params:&#x27;</span> + <span class="built_in">str</span>(dt_grid.best_params_))</span><br><span class="line">    print(<span class="string">&#x27;Top N Features Best DT Score:&#x27;</span> + <span class="built_in">str</span>(dt_grid.best_score_))</span><br><span class="line">    print(<span class="string">&#x27;Top N Features DT Train Score:&#x27;</span> + <span class="built_in">str</span>(dt_grid.score(titanic_train_data_X, titanic_train_data_Y)))</span><br><span class="line">    feature_imp_sorted_dt = pd.DataFrame(&#123;<span class="string">&#x27;feature&#x27;</span>:<span class="built_in">list</span>(titanic_train_data_X), <span class="string">&#x27;importance&#x27;</span>: dt_grid.best_estimator_.feature_importances_&#125;).sort_values(<span class="string">&#x27;importance&#x27;</span>, ascending=<span class="literal">False</span>)</span><br><span class="line">    features_top_n_dt = feature_imp_sorted_dt.head(top_n_features)[<span class="string">&#x27;feature&#x27;</span>]</span><br><span class="line">    print(<span 
class="string">&#x27;Sample 10 Features from DT Classifier:&#x27;</span>)</span><br><span class="line">    print(<span class="built_in">str</span>(features_top_n_dt[:<span class="number">10</span>]))</span><br><span class="line">   </span><br><span class="line">    <span class="comment"># merge the three models</span></span><br><span class="line">    features_top_n = pd.concat([features_top_n_rf, features_top_n_ada, features_top_n_et, features_top_n_gb, features_top_n_dt],ignore_index=<span class="literal">True</span>).drop_duplicates()</span><br><span class="line">   </span><br><span class="line">    features_importance = pd.concat([feature_imp_sorted_rf,feature_imp_sorted_ada, feature_imp_sorted_et,feature_imp_sorted_gb, feature_imp_sorted_dt],ignore_index=<span class="literal">True</span>)</span><br><span class="line">   </span><br><span class="line">    <span class="keyword">return</span> features_top_n, features_importance</span><br></pre></td></tr></table></figure>
<p>然后用筛选出来的前10个特征训练模型，进行预测，提交。<br>结果……<br><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/46.png"><br>跟使用全部特征的结果是一样的，没啥改进。最后试试模型融合吧。还是参考[6]。<br>模型融合有Bagging、Boosting、Stacking、Blending等方法。<br>Bagging 将多个模型，也就是多个基学习器的预测结果进行简单的加权平均或者投票。它的好处是可以并行地训练基学习器。Random Forest就用到了Bagging的思想。<br>Boosting 的思想有点像知错能改，每个基学习器是在上一个基学习器学习的基础上，对上一个基学习器的错误进行弥补。AdaBoost，Gradient Boost 就用到了这种思想。<br>Stacking是用新的次学习器去学习如何组合上一层的基学习器。如果把 Bagging 看作是多个基分类器的线性组合，那么Stacking就是多个基分类器的非线性组合。Stacking可以将学习器一层一层地堆砌起来，形成一个网状的结构。<br>Blending 和 Stacking 很相似，但同时它可以防止信息泄露的问题。<br>这里我们使用了两层的模型融合，Level 1使用了：RandomForest、AdaBoost、ExtraTrees、GBDT、DecisionTree、KNN、SVM ，一共7个模型，Level 2使用XGBoost，以第一层预测的结果作为特征对最终的结果进行预测。<br>如果我们在Train Data上训练，然后在Train Data上预测，就会造成标签泄露。为了避免标签泄露，我们需要对每个基学习器使用K-fold，将K个模型对Valid Set的预测结果拼起来，作为下一层学习器的输入。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br><span class="line">48</span><br><span class="line">49</span><br><span class="line">50</span><br><span class="line">51</span><br><span class="line">52</span><br><span class="line">53</span><br><span class="line">54</span><br><span class="line">55</span><br><span class="line">56</span><br><span class="line">57</span><br><span class="line">58</span><br><span class="line">59</span><br><span class="line">60</span><br><span 
class="line">61</span><br><span class="line">62</span><br><span class="line">63</span><br><span class="line">64</span><br><span class="line">65</span><br><span class="line">66</span><br><span class="line">67</span><br><span class="line">68</span><br><span class="line">69</span><br><span class="line">70</span><br><span class="line">71</span><br><span class="line">72</span><br><span class="line">73</span><br><span class="line">74</span><br><span class="line">75</span><br><span class="line">76</span><br><span class="line">77</span><br><span class="line">78</span><br><span class="line">79</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">import</span> numpy <span class="keyword">as</span> np</span><br><span class="line"></span><br><span class="line"><span class="keyword">from</span> sklearn.model_selection <span class="keyword">import</span> KFold</span><br><span class="line"></span><br><span class="line"><span class="keyword">from</span> sklearn.neighbors <span class="keyword">import</span> KNeighborsClassifier</span><br><span class="line"><span class="keyword">from</span> sklearn.svm <span class="keyword">import</span> SVC</span><br><span class="line"><span class="keyword">from</span> sklearn.ensemble <span class="keyword">import</span> RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier</span><br><span class="line"><span class="keyword">from</span> sklearn.tree <span class="keyword">import</span> DecisionTreeClassifier</span><br><span class="line"></span><br><span class="line"><span class="keyword">import</span> tools</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"></span><br><span class="line">SEED = <span class="number">0</span></span><br><span class="line">NFOLDS = <span class="number">7</span></span><br><span class="line">kf = KFold(n_splits = NFOLDS, random_state = SEED, shuffle = <span class="literal">False</span>)</span><br><span 
class="line"></span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">get_out_fold</span>(<span class="params">clf, x_train, y_train, x_test</span>):</span></span><br><span class="line">    ntrain = x_train.shape[<span class="number">0</span>]</span><br><span class="line">    ntest = x_test.shape[<span class="number">0</span>]</span><br><span class="line">    oof_train = np.zeros((ntrain, ))</span><br><span class="line">    oof_test = np.zeros((ntest, ))</span><br><span class="line">    oof_test_skf = np.empty((NFOLDS, ntest))</span><br><span class="line">   </span><br><span class="line">    <span class="keyword">for</span> i, (train_index, test_index) <span class="keyword">in</span> <span class="built_in">enumerate</span>(kf.split(x_train)):</span><br><span class="line">        x_tr = x_train[train_index]</span><br><span class="line">        y_tr = y_train[train_index]</span><br><span class="line">        x_te = x_train[test_index]</span><br><span class="line">       </span><br><span class="line">        clf.fit(x_tr, y_tr)</span><br><span class="line">       </span><br><span class="line">        oof_train[test_index] = clf.predict(x_te)</span><br><span class="line">        oof_test_skf[i, :] = clf.predict(x_test)</span><br><span class="line">    oof_test[:] = oof_test_skf.mean(axis=<span class="number">0</span>)</span><br><span class="line">    <span class="keyword">return</span> oof_train.reshape(-<span class="number">1</span>, <span class="number">1</span>), oof_test.reshape(-<span class="number">1</span>, <span class="number">1</span>)</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="comment"># 模型融合</span></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">MergeModels</span>(<span class="params">df_all, top_features</span>):</span></span><br><span class="line">    print(<span 
class="string">&quot;模型融合&quot;</span>)</span><br><span class="line">    <span class="comment"># level 1</span></span><br><span class="line">    rf = RandomForestClassifier(n_estimators=<span class="number">500</span>, warm_start=<span class="literal">True</span>, max_features=<span class="string">&#x27;sqrt&#x27;</span>,max_depth=<span class="number">6</span>, min_samples_split=<span class="number">3</span>, min_samples_leaf=<span class="number">2</span>, n_jobs=-<span class="number">1</span>, verbose=<span class="number">0</span>)</span><br><span class="line">    ada = AdaBoostClassifier(n_estimators=<span class="number">500</span>, learning_rate=<span class="number">0.1</span>)</span><br><span class="line">    et = ExtraTreesClassifier(n_estimators=<span class="number">500</span>, n_jobs=-<span class="number">1</span>, max_depth=<span class="number">8</span>,min_samples_leaf=<span class="number">2</span>, verbose=<span class="number">0</span>)</span><br><span class="line">    gb = GradientBoostingClassifier(n_estimators=<span class="number">500</span>, learning_rate=<span class="number">0.008</span>, min_samples_split=<span class="number">3</span>, min_samples_leaf=<span class="number">2</span>, max_depth=<span class="number">5</span>,verbose=<span class="number">0</span>)</span><br><span class="line">    dt = DecisionTreeClassifier(max_depth=<span class="number">8</span>)</span><br><span class="line">    knn = KNeighborsClassifier(n_neighbors = <span class="number">2</span>)</span><br><span class="line">    svm = SVC(kernel=<span class="string">&#x27;linear&#x27;</span>, C=<span class="number">0.025</span>)</span><br><span class="line">   </span><br><span class="line">    train_df, test_df = tools.divide_df(df_all)</span><br><span class="line">    x_train = train_df[top_features].values</span><br><span class="line">    y_train = train_df[<span class="string">&quot;Survived&quot;</span>].values</span><br><span class="line">    x_test = 
test_df[top_features].values</span><br><span class="line">   </span><br><span class="line">    rf_oof_train, rf_oof_test = get_out_fold(rf, x_train, y_train, x_test)</span><br><span class="line">    <span class="comment"># Random Forest</span></span><br><span class="line">    ada_oof_train, ada_oof_test = get_out_fold(ada, x_train, y_train, x_test)</span><br><span class="line">    <span class="comment"># AdaBoost</span></span><br><span class="line">    et_oof_train, et_oof_test = get_out_fold(et, x_train, y_train, x_test)</span><br><span class="line">    <span class="comment"># Extra Trees</span></span><br><span class="line">    gb_oof_train, gb_oof_test = get_out_fold(gb, x_train, y_train, x_test)</span><br><span class="line">    <span class="comment"># Gradient Boost</span></span><br><span class="line">    dt_oof_train, dt_oof_test = get_out_fold(dt, x_train, y_train, x_test)</span><br><span class="line">    <span class="comment"># Decision Tree</span></span><br><span class="line">    knn_oof_train, knn_oof_test = get_out_fold(knn, x_train, y_train, x_test)</span><br><span class="line">    <span class="comment"># KNeighbors</span></span><br><span class="line">    svm_oof_train, svm_oof_test = get_out_fold(svm, x_train, y_train, x_test)</span><br><span class="line">    <span class="comment"># Support Vector</span></span><br><span class="line">    print(<span class="string">&quot;训练完成&quot;</span>)</span><br><span class="line"><span class="comment"># 接着是level2，利用XGBoost，使用第一层预测的结果作为特征对最终的结果进行预测。</span></span><br><span class="line">    <span class="comment"># level 2 预测并生成提交文件</span></span><br><span class="line">    x_train = np.concatenate((rf_oof_train, ada_oof_train, et_oof_train, gb_oof_train, dt_oof_train, knn_oof_train, svm_oof_train), axis=<span class="number">1</span>)</span><br><span class="line">    x_test = np.concatenate((rf_oof_test, ada_oof_test, et_oof_test, gb_oof_test, dt_oof_test, knn_oof_test, svm_oof_test), axis=<span class="number">1</span>)</span><br><span class="line">    
</span><br><span class="line">    gbm = XGBClassifier( n_estimators= <span class="number">2000</span>, max_depth= <span class="number">4</span>, min_child_weight= <span class="number">2</span>, gamma=<span class="number">0.9</span>, subsample=<span class="number">0.8</span>,colsample_bytree=<span class="number">0.8</span>, objective= <span class="string">&#x27;binary:logistic&#x27;</span>, nthread= -<span class="number">1</span>,scale_pos_weight=<span class="number">1</span>).fit(x_train, y_train)</span><br><span class="line">    predictions = gbm.predict(x_test)</span><br><span class="line">    StackingSubmission = pd.DataFrame(&#123;<span class="string">&#x27;PassengerId&#x27;</span>: test_df[<span class="string">&quot;PassengerId&quot;</span>], <span class="string">&#x27;Survived&#x27;</span>: predictions&#125;)</span><br><span class="line">    StackingSubmission.to_csv(<span class="string">&#x27;StackingSubmission.csv&#x27;</span>,index=<span class="literal">False</span>,sep=<span class="string">&#x27;,&#x27;</span>)</span><br></pre></td></tr></table></figure>
<p>提交结果看看。<br><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/blog0178-QTLearn/30/47.png"><br>还是不如一开始只使用随机树森林的模型。可见关键问题还是对数据进行特征工程。好了，这篇文章已经够长了，先到这里吧。复工了，学习的时间少了很多。</p>
<p>参考文献<br>1.<a target="_blank" rel="noopener" href="https://www.kaggle.com/gunesevitan/titanic-advanced-feature-engineering-tutorial">https://www.kaggle.com/gunesevitan/titanic-advanced-feature-engineering-tutorial</a><br>2.<a target="_blank" rel="noopener" href="https://blog.csdn.net/Koala_Tree/article/details/78725881">https://blog.csdn.net/Koala_Tree/article/details/78725881</a><br>3.<a target="_blank" rel="noopener" href="https://zhuanlan.zhihu.com/p/107958980">https://zhuanlan.zhihu.com/p/107958980</a><br>4.<a target="_blank" rel="noopener" href="https://blog.csdn.net/geduo_feng/article/details/79547554">https://blog.csdn.net/geduo_feng/article/details/79547554</a><br>5.<a target="_blank" rel="noopener" href="https://blog.csdn.net/kamendula/article/details/70318639">https://blog.csdn.net/kamendula/article/details/70318639</a><br>6.<a target="_blank" rel="noopener" href="https://blog.csdn.net/Koala_Tree/article/details/78725881">https://blog.csdn.net/Koala_Tree/article/details/78725881</a></p>
<p>我发文章的四个地方，欢迎大家在朋友圈等地方分享，欢迎点“在看”。<br>我的个人博客地址：<a href="https://zwdnet.github.io/">https://zwdnet.github.io</a><br>我的知乎文章地址： <a target="_blank" rel="noopener" href="https://www.zhihu.com/people/zhao-you-min/posts">https://www.zhihu.com/people/zhao-you-min/posts</a><br>我的博客园博客地址： <a target="_blank" rel="noopener" href="https://www.cnblogs.com/zwdnet/">https://www.cnblogs.com/zwdnet/</a><br>我的微信个人订阅号：赵瑜敏的口腔医学学习园地</p>
<p><img src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/other/wx.jpg"></p>

      
    </div>
    
    
    

    

    
      <div>
        <div style="padding: 10px 0; margin: 20px auto; width: 90%; text-align: center;">
  <div>欢迎打赏！感谢支持！</div>
  <button id="rewardButton" type="button" onclick="var qr = document.getElementById('QR'); if (qr.style.display === 'none') {qr.style.display='block';} else {qr.style.display='none'}">
    <span>打赏</span>
  </button>
  <div id="QR" style="display: none;">

    
      <div id="wechat" style="display: inline-block">
        <img id="wechat_qr" src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/other/mm_facetoface_collect_qrcode_1542944836634.png" alt="微信支付"/>
        <p>微信支付</p>
      </div>
    

    
      <div id="alipay" style="display: inline-block">
        <img id="alipay_qr" src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/other/1542944857770.jpg" alt="支付宝"/>
        <p>支付宝</p>
      </div>
    

    

  </div>
</div>

      </div>
    

    

    <footer class="post-footer">
      
        <div class="post-tags">
          
            <a href="/tags/Python/" rel="tag"># Python</a>
          
            <a href="/tags/%E9%87%8F%E5%8C%96%E6%8A%95%E8%B5%84/" rel="tag"># 量化投资</a>
          
            <a href="/tags/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0/" rel="tag"># 机器学习</a>
          
            <a href="/tags/kaggle/" rel="tag"># kaggle</a>
          
            <a href="/tags/%E5%88%86%E7%B1%BB%E7%AE%97%E6%B3%95%EF%BC%8C%E6%94%AF%E6%8C%81%E5%90%91%E9%87%8F%E6%9C%BA/" rel="tag"># 分类算法，支持向量机</a>
          
            <a href="/tags/%E5%AE%9E%E4%BE%8B/" rel="tag"># 实例</a>
          
        </div>
      

      
      
      

      
        <div class="post-nav">
          <div class="post-nav-next post-nav-item">
            
              <a href="/2020/03/22/%E4%BD%A0%E4%BC%9A%E5%A4%87%E7%89%99%E5%90%97%E2%80%94%E2%80%94%E5%A6%82%E4%BD%95%E9%98%B2%E6%AD%A2%E4%BF%AE%E5%A4%8D%E4%BD%93%E8%84%B1%E8%90%BD%EF%BC%9F/" rel="next" title="你会备牙吗——如何防止修复体脱落？">
                <i class="fa fa-chevron-left"></i> 你会备牙吗——如何防止修复体脱落？
              </a>
            
          </div>

          <span class="post-nav-divider"></span>

          <div class="post-nav-prev post-nav-item">
            
              <a href="/2020/03/23/%E7%89%99%E9%AB%93%E7%97%85%E6%80%BB%E7%BB%9312%E2%80%94%E2%80%94%E9%92%99%E5%8C%96%E6%A0%B9%E7%AE%A1%E7%9A%84%E9%97%AE%E9%A2%98/" rel="prev" title="牙髓病总结12——钙化根管的问题">
                牙髓病总结12——钙化根管的问题 <i class="fa fa-chevron-right"></i>
              </a>
            
          </div>
        </div>
      

      
      
    </footer>
  </div>
  
  
  
  </article>



    <div class="post-spread">
      
    </div>
  </div>


          </div>
          


          

  
    <div class="comments" id="comments">
      <div id="lv-container" data-id="city" data-uid="MTAyMC80MTA2Mi8xNzU4Nw=="></div>
    </div>

  



        </div>
        
          
  
  <div class="sidebar-toggle">
    <div class="sidebar-toggle-line-wrap">
      <span class="sidebar-toggle-line sidebar-toggle-line-first"></span>
      <span class="sidebar-toggle-line sidebar-toggle-line-middle"></span>
      <span class="sidebar-toggle-line sidebar-toggle-line-last"></span>
    </div>
  </div>

  <aside id="sidebar" class="sidebar">
    
    <div class="sidebar-inner">

      

      

      <section class="site-overview-wrap sidebar-panel sidebar-panel-active">
        <div class="site-overview">
          <div class="site-author motion-element" itemprop="author" itemscope itemtype="http://schema.org/Person">
            
              <img class="site-author-image" itemprop="image"
                src="https://zymblog-1258069789.cos.ap-chengdu.myqcloud.com/other/tx.jpg"
                alt="" />
            
              <p class="site-author-name" itemprop="name"></p>
              <p class="site-description motion-element" itemprop="description"></p>
          </div>

          <nav class="site-state motion-element">

            
              <div class="site-state-item site-state-posts">
              
                <a href="/archives/">
              
                  <span class="site-state-item-count">452</span>
                  <span class="site-state-item-name">日志</span>
                </a>
              </div>
            

            
              
              
              <div class="site-state-item site-state-categories">
                <a href="/categories/index.html">
                  <span class="site-state-item-count">29</span>
                  <span class="site-state-item-name">分类</span>
                </a>
              </div>
            

            
              
              
              <div class="site-state-item site-state-tags">
                <a href="/tags/index.html">
                  <span class="site-state-item-count">544</span>
                  <span class="site-state-item-name">标签</span>
                </a>
              </div>
            

          </nav>

          

          

          
          

          
          

          

        </div>
      </section>

      

      

    </div>
  </aside>


        
      </div>
    </main>

    <footer id="footer" class="footer">
      <div class="footer-inner">
        <div class="copyright">&copy; <span itemprop="copyrightYear">2021</span>
  <span class="with-love">
    <i class="fa fa-user"></i>
  </span>
  <span class="author" itemprop="copyrightHolder">本站版权归赵瑜敏所有，如欲转载请与本人联系。</span>

  
    <span class="post-meta-divider">|</span>
    <span class="post-meta-item-icon">
      <i class="fa fa-area-chart"></i>
    </span>
    
      <span class="post-meta-item-text">Site words total count&#58;</span>
    
    <span title="Site words total count">1225.8k</span>
  
</div>









<div>
  <!-- CNZZ/Umeng visitor-statistics badge. The inline script picks an
       " https://" or " http://" prefix to match the page protocol, then
       document.write()s the badge <span> plus the collector <script> tag
       (the markup arrives percent-encoded and is decoded via unescape).
       NOTE(review): document.write blocks parsing; kept byte-identical
       because this is vendor-supplied embed code. -->
  <script type="text/javascript">var cnzz_protocol = (("https:" == document.location.protocol) ? " https://" : " http://");document.write(unescape("%3Cspan id='cnzz_stat_icon_1275447216'%3E%3C/span%3E%3Cscript src='" + cnzz_protocol + "s11.cnzz.com/z_stat.php%3Fid%3D1275447216%26online%3D1%26show%3Dline' type='text/javascript'%3E%3C/script%3E"));</script>
</div>

        







  <div style="display: none;">
    <script src="https://s95.cnzz.com/z_stat.php?id=1275447216&amp;web_id=1275447216"></script>
  </div>



        
      </div>
    </footer>

    
      <div class="back-to-top">
        <i class="fa fa-arrow-up"></i>
        
      </div>
    

    

  </div>

  

<script type="text/javascript">
  // If the global Promise is absent or not a genuine function (e.g. a broken
  // third-party shim), clear it so a later-loaded polyfill can install a
  // spec-compliant implementation in its place.
  (function (tag) {
    if (tag !== '[object Function]') {
      window.Promise = null;
    }
  })(Object.prototype.toString.call(window.Promise));
</script>









  












  
  
    <script type="text/javascript" src="/lib/jquery/index.js?v=2.1.3"></script>
  

  
  
    <script type="text/javascript" src="/lib/fastclick/lib/fastclick.min.js?v=1.0.6"></script>
  

  
  
    <script type="text/javascript" src="/lib/jquery_lazyload/jquery.lazyload.js?v=1.9.7"></script>
  

  
  
    <script type="text/javascript" src="/lib/velocity/velocity.min.js?v=1.2.1"></script>
  

  
  
    <script type="text/javascript" src="/lib/velocity/velocity.ui.min.js?v=1.2.1"></script>
  

  
  
    <script type="text/javascript" src="/lib/fancybox/source/jquery.fancybox.pack.js?v=2.1.5"></script>
  


  


  <script type="text/javascript" src="/js/src/utils.js?v=5.1.4"></script>

  <script type="text/javascript" src="/js/src/motion.js?v=5.1.4"></script>



  
  


  <script type="text/javascript" src="/js/src/affix.js?v=5.1.4"></script>

  <script type="text/javascript" src="/js/src/schemes/pisces.js?v=5.1.4"></script>



  
  <script type="text/javascript" src="/js/src/scrollspy.js?v=5.1.4"></script>
<script type="text/javascript" src="/js/src/post-details.js?v=5.1.4"></script>



  


  <script type="text/javascript" src="/js/src/bootstrap.js?v=5.1.4"></script>



  


  




	





  





  
    <script type="text/javascript">
      // LiveRe (livere.com) comment-widget loader, run as an IIFE over
      // (document, 'script').
      (function(d, s) {
        var j, e = d.getElementsByTagName(s)[0];
        // Bail out if the embed script already ran: it defines LivereTower.
        if (typeof LivereTower === 'function') { return; }
        j = d.createElement(s);
        j.src = 'https://cdn-city.livere.com/js/embed.dist.js';
        j.async = true;
        // Insert next to the first <script> on the page; async keeps it
        // from blocking rendering.
        e.parentNode.insertBefore(j, e);
      })(document, 'script');
    </script>
  












  





  

  

  

  
  

  

  

  

  
</body>
</html>
